1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-NEON
3 ; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+sve < %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SVE
4 ; RUN: llc -mtriple=aarch64 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
; sext(<8 x i8>) * sext(<8 x i8>) computed in <8 x i16>. Every run
; configuration expects the two loaded d-registers to feed a single SMULL.
6 define <8 x i16> @smull_v8i8_v8i16(ptr %A, ptr %B) nounwind {
7 ; CHECK-LABEL: smull_v8i8_v8i16:
9 ; CHECK-NEXT: ldr d0, [x0]
10 ; CHECK-NEXT: ldr d1, [x1]
11 ; CHECK-NEXT: smull v0.8h, v0.8b, v1.8b
13 %tmp1 = load <8 x i8>, ptr %A
14 %tmp2 = load <8 x i8>, ptr %B
15 %tmp3 = sext <8 x i8> %tmp1 to <8 x i16>
16 %tmp4 = sext <8 x i8> %tmp2 to <8 x i16>
17 %tmp5 = mul <8 x i16> %tmp3, %tmp4
; sext(<4 x i16>) * sext(<4 x i16>) computed in <4 x i32>. Every run
; configuration expects a single SMULL on the loaded d-registers.
21 define <4 x i32> @smull_v4i16_v4i32(ptr %A, ptr %B) nounwind {
22 ; CHECK-LABEL: smull_v4i16_v4i32:
24 ; CHECK-NEXT: ldr d0, [x0]
25 ; CHECK-NEXT: ldr d1, [x1]
26 ; CHECK-NEXT: smull v0.4s, v0.4h, v1.4h
28 %tmp1 = load <4 x i16>, ptr %A
29 %tmp2 = load <4 x i16>, ptr %B
30 %tmp3 = sext <4 x i16> %tmp1 to <4 x i32>
31 %tmp4 = sext <4 x i16> %tmp2 to <4 x i32>
32 %tmp5 = mul <4 x i32> %tmp3, %tmp4
; sext(<2 x i32>) * sext(<2 x i32>) computed in <2 x i64>. Every run
; configuration expects a single SMULL on the loaded d-registers.
36 define <2 x i64> @smull_v2i32_v2i64(ptr %A, ptr %B) nounwind {
37 ; CHECK-LABEL: smull_v2i32_v2i64:
39 ; CHECK-NEXT: ldr d0, [x0]
40 ; CHECK-NEXT: ldr d1, [x1]
41 ; CHECK-NEXT: smull v0.2d, v0.2s, v1.2s
43 %tmp1 = load <2 x i32>, ptr %A
44 %tmp2 = load <2 x i32>, ptr %B
45 %tmp3 = sext <2 x i32> %tmp1 to <2 x i64>
46 %tmp4 = sext <2 x i32> %tmp2 to <2 x i64>
47 %tmp5 = mul <2 x i64> %tmp3, %tmp4
; Mixed extension: zext(<8 x i8>) * sext(<8 x i16>) computed in <8 x i32>.
; The NEON and SVE runs expect the i8 operand to be widened to 16 bits with
; USHLL and the product formed with SMULL/SMULL2. The GlobalISel run expects
; both operands fully widened to 32 bits followed by two plain MULs.
51 define <8 x i32> @smull_zext_v8i8_v8i32(ptr %A, ptr %B) nounwind {
52 ; CHECK-NEON-LABEL: smull_zext_v8i8_v8i32:
53 ; CHECK-NEON: // %bb.0:
54 ; CHECK-NEON-NEXT: ldr d0, [x0]
55 ; CHECK-NEON-NEXT: ldr q2, [x1]
56 ; CHECK-NEON-NEXT: ushll v0.8h, v0.8b, #0
57 ; CHECK-NEON-NEXT: smull2 v1.4s, v0.8h, v2.8h
58 ; CHECK-NEON-NEXT: smull v0.4s, v0.4h, v2.4h
59 ; CHECK-NEON-NEXT: ret
61 ; CHECK-SVE-LABEL: smull_zext_v8i8_v8i32:
62 ; CHECK-SVE: // %bb.0:
63 ; CHECK-SVE-NEXT: ldr d0, [x0]
64 ; CHECK-SVE-NEXT: ldr q2, [x1]
65 ; CHECK-SVE-NEXT: ushll v0.8h, v0.8b, #0
66 ; CHECK-SVE-NEXT: smull2 v1.4s, v0.8h, v2.8h
67 ; CHECK-SVE-NEXT: smull v0.4s, v0.4h, v2.4h
70 ; CHECK-GI-LABEL: smull_zext_v8i8_v8i32:
72 ; CHECK-GI-NEXT: ldr d0, [x0]
73 ; CHECK-GI-NEXT: ldr q1, [x1]
74 ; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0
75 ; CHECK-GI-NEXT: ushll v2.4s, v0.4h, #0
76 ; CHECK-GI-NEXT: ushll2 v3.4s, v0.8h, #0
77 ; CHECK-GI-NEXT: sshll v0.4s, v1.4h, #0
78 ; CHECK-GI-NEXT: sshll2 v1.4s, v1.8h, #0
79 ; CHECK-GI-NEXT: mul v0.4s, v2.4s, v0.4s
80 ; CHECK-GI-NEXT: mul v1.4s, v3.4s, v1.4s
82 %load.A = load <8 x i8>, ptr %A
83 %load.B = load <8 x i16>, ptr %B
84 %zext.A = zext <8 x i8> %load.A to <8 x i32>
85 %sext.B = sext <8 x i16> %load.B to <8 x i32>
86 %res = mul <8 x i32> %zext.A, %sext.B
; Same mixed-extension multiply as smull_zext_v8i8_v8i32, but with the
; sign-extended <8 x i16> value as the first mul operand and the
; zero-extended <8 x i8> value as the second. The NEON and SVE runs still
; expect USHLL + SMULL/SMULL2; the GlobalISel run expects widening + MUL.
90 define <8 x i32> @smull_zext_v8i8_v8i32_sext_first_operand(ptr %A, ptr %B) nounwind {
91 ; CHECK-NEON-LABEL: smull_zext_v8i8_v8i32_sext_first_operand:
92 ; CHECK-NEON: // %bb.0:
93 ; CHECK-NEON-NEXT: ldr d0, [x1]
94 ; CHECK-NEON-NEXT: ldr q2, [x0]
95 ; CHECK-NEON-NEXT: ushll v0.8h, v0.8b, #0
96 ; CHECK-NEON-NEXT: smull2 v1.4s, v2.8h, v0.8h
97 ; CHECK-NEON-NEXT: smull v0.4s, v2.4h, v0.4h
98 ; CHECK-NEON-NEXT: ret
100 ; CHECK-SVE-LABEL: smull_zext_v8i8_v8i32_sext_first_operand:
101 ; CHECK-SVE: // %bb.0:
102 ; CHECK-SVE-NEXT: ldr d0, [x1]
103 ; CHECK-SVE-NEXT: ldr q2, [x0]
104 ; CHECK-SVE-NEXT: ushll v0.8h, v0.8b, #0
105 ; CHECK-SVE-NEXT: smull2 v1.4s, v2.8h, v0.8h
106 ; CHECK-SVE-NEXT: smull v0.4s, v2.4h, v0.4h
107 ; CHECK-SVE-NEXT: ret
109 ; CHECK-GI-LABEL: smull_zext_v8i8_v8i32_sext_first_operand:
110 ; CHECK-GI: // %bb.0:
111 ; CHECK-GI-NEXT: ldr d0, [x1]
112 ; CHECK-GI-NEXT: ldr q1, [x0]
113 ; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0
114 ; CHECK-GI-NEXT: sshll v2.4s, v1.4h, #0
115 ; CHECK-GI-NEXT: sshll2 v1.4s, v1.8h, #0
116 ; CHECK-GI-NEXT: ushll v3.4s, v0.4h, #0
117 ; CHECK-GI-NEXT: ushll2 v4.4s, v0.8h, #0
118 ; CHECK-GI-NEXT: mul v0.4s, v2.4s, v3.4s
119 ; CHECK-GI-NEXT: mul v1.4s, v1.4s, v4.4s
121 %load.A = load <8 x i16>, ptr %A
122 %load.B = load <8 x i8>, ptr %B
123 %sext.A = sext <8 x i16> %load.A to <8 x i32>
124 %zext.B = zext <8 x i8> %load.B to <8 x i32>
125 %res = mul <8 x i32> %sext.A, %zext.B
; Negative test: the zero-extended <8 x i16> operand has bit 15 forced on by
; the OR with 0x8000 before it is multiplied by a sign-extended <8 x i16>.
; No run configuration expects SMULL here; all of them expect both operands
; to be widened to 32 bits and multiplied with plain MULs.
; NOTE(review): presumably SMULL is rejected because the zext operand can no
; longer be reinterpreted as a non-negative signed 16-bit value - confirm
; against the AArch64 lowering code.
129 define <8 x i32> @smull_zext_v8i8_v8i32_top_bit_is_1(ptr %A, ptr %B) nounwind {
130 ; CHECK-NEON-LABEL: smull_zext_v8i8_v8i32_top_bit_is_1:
131 ; CHECK-NEON: // %bb.0:
132 ; CHECK-NEON-NEXT: ldr q0, [x0]
133 ; CHECK-NEON-NEXT: ldr q1, [x1]
134 ; CHECK-NEON-NEXT: orr v0.8h, #128, lsl #8
135 ; CHECK-NEON-NEXT: sshll v3.4s, v1.4h, #0
136 ; CHECK-NEON-NEXT: sshll2 v1.4s, v1.8h, #0
137 ; CHECK-NEON-NEXT: ushll v2.4s, v0.4h, #0
138 ; CHECK-NEON-NEXT: ushll2 v0.4s, v0.8h, #0
139 ; CHECK-NEON-NEXT: mul v1.4s, v0.4s, v1.4s
140 ; CHECK-NEON-NEXT: mul v0.4s, v2.4s, v3.4s
141 ; CHECK-NEON-NEXT: ret
143 ; CHECK-SVE-LABEL: smull_zext_v8i8_v8i32_top_bit_is_1:
144 ; CHECK-SVE: // %bb.0:
145 ; CHECK-SVE-NEXT: ldr q0, [x0]
146 ; CHECK-SVE-NEXT: ldr q1, [x1]
147 ; CHECK-SVE-NEXT: orr v0.8h, #128, lsl #8
148 ; CHECK-SVE-NEXT: sshll v3.4s, v1.4h, #0
149 ; CHECK-SVE-NEXT: sshll2 v1.4s, v1.8h, #0
150 ; CHECK-SVE-NEXT: ushll v2.4s, v0.4h, #0
151 ; CHECK-SVE-NEXT: ushll2 v0.4s, v0.8h, #0
152 ; CHECK-SVE-NEXT: mul v1.4s, v0.4s, v1.4s
153 ; CHECK-SVE-NEXT: mul v0.4s, v2.4s, v3.4s
154 ; CHECK-SVE-NEXT: ret
156 ; CHECK-GI-LABEL: smull_zext_v8i8_v8i32_top_bit_is_1:
157 ; CHECK-GI: // %bb.0:
158 ; CHECK-GI-NEXT: movi v0.8h, #128, lsl #8
159 ; CHECK-GI-NEXT: ldr q1, [x0]
160 ; CHECK-GI-NEXT: orr v0.16b, v1.16b, v0.16b
161 ; CHECK-GI-NEXT: ldr q1, [x1]
162 ; CHECK-GI-NEXT: ushll v2.4s, v0.4h, #0
163 ; CHECK-GI-NEXT: ushll2 v3.4s, v0.8h, #0
164 ; CHECK-GI-NEXT: sshll v0.4s, v1.4h, #0
165 ; CHECK-GI-NEXT: sshll2 v1.4s, v1.8h, #0
166 ; CHECK-GI-NEXT: mul v0.4s, v2.4s, v0.4s
167 ; CHECK-GI-NEXT: mul v1.4s, v3.4s, v1.4s
169 %load.A = load <8 x i16>, ptr %A
170 %or.A = or <8 x i16> %load.A, <i16 u0x8000, i16 u0x8000, i16 u0x8000, i16 u0x8000, i16 u0x8000, i16 u0x8000, i16 u0x8000, i16 u0x8000>
171 %load.B = load <8 x i16>, ptr %B
172 %zext.A = zext <8 x i16> %or.A to <8 x i32>
173 %sext.B = sext <8 x i16> %load.B to <8 x i32>
174 %res = mul <8 x i32> %zext.A, %sext.B
; zext(<4 x i8>) * sext(<4 x i16>) computed in <4 x i32>. Every run
; configuration expects the i8 operand to be widened with USHLL and the
; product formed by a single SMULL.
178 define <4 x i32> @smull_zext_v4i16_v4i32(ptr %A, ptr %B) nounwind {
179 ; CHECK-LABEL: smull_zext_v4i16_v4i32:
181 ; CHECK-NEXT: ldr s0, [x0]
182 ; CHECK-NEXT: ldr d1, [x1]
183 ; CHECK-NEXT: ushll v0.8h, v0.8b, #0
184 ; CHECK-NEXT: smull v0.4s, v0.4h, v1.4h
186 %load.A = load <4 x i8>, ptr %A
187 %load.B = load <4 x i16>, ptr %B
188 %zext.A = zext <4 x i8> %load.A to <4 x i32>
189 %sext.B = sext <4 x i16> %load.B to <4 x i32>
190 %res = mul <4 x i32> %zext.A, %sext.B
; zext(<2 x i16>) * sext(<2 x i32>) computed in <2 x i64>. None of the run
; configurations is expected to produce a vector SMULL for this shape:
;  - the NEON run expects per-lane scalar SMULLs on general registers,
;  - the SVE run expects a predicated SVE MUL on 64-bit lanes,
;  - the GlobalISel run expects per-lane scalar 64-bit MULs.
194 define <2 x i64> @smull_zext_v2i32_v2i64(ptr %A, ptr %B) nounwind {
195 ; CHECK-NEON-LABEL: smull_zext_v2i32_v2i64:
196 ; CHECK-NEON: // %bb.0:
197 ; CHECK-NEON-NEXT: ldr d0, [x1]
198 ; CHECK-NEON-NEXT: ldrh w9, [x0]
199 ; CHECK-NEON-NEXT: ldrh w10, [x0, #2]
200 ; CHECK-NEON-NEXT: sshll v0.2d, v0.2s, #0
201 ; CHECK-NEON-NEXT: fmov x11, d0
202 ; CHECK-NEON-NEXT: mov x8, v0.d[1]
203 ; CHECK-NEON-NEXT: smull x9, w9, w11
204 ; CHECK-NEON-NEXT: smull x8, w10, w8
205 ; CHECK-NEON-NEXT: fmov d0, x9
206 ; CHECK-NEON-NEXT: mov v0.d[1], x8
207 ; CHECK-NEON-NEXT: ret
209 ; CHECK-SVE-LABEL: smull_zext_v2i32_v2i64:
210 ; CHECK-SVE: // %bb.0:
211 ; CHECK-SVE-NEXT: ldrh w8, [x0]
212 ; CHECK-SVE-NEXT: ptrue p0.d, vl2
213 ; CHECK-SVE-NEXT: ldrh w9, [x0, #2]
214 ; CHECK-SVE-NEXT: ldr d0, [x1]
215 ; CHECK-SVE-NEXT: fmov d1, x8
216 ; CHECK-SVE-NEXT: sshll v0.2d, v0.2s, #0
217 ; CHECK-SVE-NEXT: mov v1.d[1], x9
218 ; CHECK-SVE-NEXT: mul z0.d, p0/m, z0.d, z1.d
219 ; CHECK-SVE-NEXT: // kill: def $q0 killed $q0 killed $z0
220 ; CHECK-SVE-NEXT: ret
222 ; CHECK-GI-LABEL: smull_zext_v2i32_v2i64:
223 ; CHECK-GI: // %bb.0:
224 ; CHECK-GI-NEXT: ldr h1, [x0]
225 ; CHECK-GI-NEXT: ldr h2, [x0, #2]
226 ; CHECK-GI-NEXT: movi d0, #0x00ffff0000ffff
227 ; CHECK-GI-NEXT: mov v1.s[1], v2.s[0]
228 ; CHECK-GI-NEXT: and v0.8b, v1.8b, v0.8b
229 ; CHECK-GI-NEXT: mov s1, v0.s[1]
230 ; CHECK-GI-NEXT: fmov w8, s0
231 ; CHECK-GI-NEXT: ldr d0, [x1]
232 ; CHECK-GI-NEXT: sshll v0.2d, v0.2s, #0
233 ; CHECK-GI-NEXT: fmov w9, s1
234 ; CHECK-GI-NEXT: fmov d1, x8
235 ; CHECK-GI-NEXT: mov d3, v0.d[1]
236 ; CHECK-GI-NEXT: mov v1.d[1], x9
237 ; CHECK-GI-NEXT: fmov x9, d0
238 ; CHECK-GI-NEXT: fmov x10, d3
239 ; CHECK-GI-NEXT: mov d2, v1.d[1]
240 ; CHECK-GI-NEXT: fmov x8, d1
241 ; CHECK-GI-NEXT: mul x8, x8, x9
242 ; CHECK-GI-NEXT: fmov x9, d2
243 ; CHECK-GI-NEXT: mul x9, x9, x10
244 ; CHECK-GI-NEXT: fmov d0, x8
245 ; CHECK-GI-NEXT: mov v0.d[1], x9
247 %load.A = load <2 x i16>, ptr %A
248 %load.B = load <2 x i32>, ptr %B
249 %zext.A = zext <2 x i16> %load.A to <2 x i64>
250 %sext.B = sext <2 x i32> %load.B to <2 x i64>
251 %res = mul <2 x i64> %zext.A, %sext.B
; The zero-extended <2 x i32> operand is first ANDed with 0x7FFFFFFF, which
; clears its top bit, and is then multiplied by a sign-extended <2 x i32>.
; The NEON and SVE runs expect BIC (clearing bit 31) followed by one SMULL.
; The GlobalISel run expects both operands widened and per-lane scalar MULs.
255 define <2 x i64> @smull_zext_and_v2i32_v2i64(ptr %A, ptr %B) nounwind {
256 ; CHECK-NEON-LABEL: smull_zext_and_v2i32_v2i64:
257 ; CHECK-NEON: // %bb.0:
258 ; CHECK-NEON-NEXT: ldr d0, [x0]
259 ; CHECK-NEON-NEXT: ldr d1, [x1]
260 ; CHECK-NEON-NEXT: bic v0.2s, #128, lsl #24
261 ; CHECK-NEON-NEXT: smull v0.2d, v0.2s, v1.2s
262 ; CHECK-NEON-NEXT: ret
264 ; CHECK-SVE-LABEL: smull_zext_and_v2i32_v2i64:
265 ; CHECK-SVE: // %bb.0:
266 ; CHECK-SVE-NEXT: ldr d0, [x0]
267 ; CHECK-SVE-NEXT: ldr d1, [x1]
268 ; CHECK-SVE-NEXT: bic v0.2s, #128, lsl #24
269 ; CHECK-SVE-NEXT: smull v0.2d, v0.2s, v1.2s
270 ; CHECK-SVE-NEXT: ret
272 ; CHECK-GI-LABEL: smull_zext_and_v2i32_v2i64:
273 ; CHECK-GI: // %bb.0:
274 ; CHECK-GI-NEXT: mvni v0.2s, #128, lsl #24
275 ; CHECK-GI-NEXT: ldr d1, [x0]
276 ; CHECK-GI-NEXT: and v0.8b, v1.8b, v0.8b
277 ; CHECK-GI-NEXT: ldr d1, [x1]
278 ; CHECK-GI-NEXT: sshll v1.2d, v1.2s, #0
279 ; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0
280 ; CHECK-GI-NEXT: mov d3, v1.d[1]
281 ; CHECK-GI-NEXT: fmov x9, d1
282 ; CHECK-GI-NEXT: mov d2, v0.d[1]
283 ; CHECK-GI-NEXT: fmov x8, d0
284 ; CHECK-GI-NEXT: mul x8, x8, x9
285 ; CHECK-GI-NEXT: fmov x10, d3
286 ; CHECK-GI-NEXT: fmov x9, d2
287 ; CHECK-GI-NEXT: mul x9, x9, x10
288 ; CHECK-GI-NEXT: fmov d0, x8
289 ; CHECK-GI-NEXT: mov v0.d[1], x9
291 %load.A = load <2 x i32>, ptr %A
292 %and.A = and <2 x i32> %load.A, <i32 u0x7FFFFFFF, i32 u0x7FFFFFFF>
293 %load.B = load <2 x i32>, ptr %B
294 %zext.A = zext <2 x i32> %and.A to <2 x i64>
295 %sext.B = sext <2 x i32> %load.B to <2 x i64>
296 %res = mul <2 x i64> %zext.A, %sext.B
; zext(<8 x i8>) * zext(<8 x i8>) computed in <8 x i16>. Every run
; configuration expects a single UMULL on the loaded d-registers.
300 define <8 x i16> @umull_v8i8_v8i16(ptr %A, ptr %B) nounwind {
301 ; CHECK-LABEL: umull_v8i8_v8i16:
303 ; CHECK-NEXT: ldr d0, [x0]
304 ; CHECK-NEXT: ldr d1, [x1]
305 ; CHECK-NEXT: umull v0.8h, v0.8b, v1.8b
307 %tmp1 = load <8 x i8>, ptr %A
308 %tmp2 = load <8 x i8>, ptr %B
309 %tmp3 = zext <8 x i8> %tmp1 to <8 x i16>
310 %tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
311 %tmp5 = mul <8 x i16> %tmp3, %tmp4
; zext(<4 x i16>) * zext(<4 x i16>) computed in <4 x i32>. Every run
; configuration expects a single UMULL on the loaded d-registers.
315 define <4 x i32> @umull_v4i16_v4i32(ptr %A, ptr %B) nounwind {
316 ; CHECK-LABEL: umull_v4i16_v4i32:
318 ; CHECK-NEXT: ldr d0, [x0]
319 ; CHECK-NEXT: ldr d1, [x1]
320 ; CHECK-NEXT: umull v0.4s, v0.4h, v1.4h
322 %tmp1 = load <4 x i16>, ptr %A
323 %tmp2 = load <4 x i16>, ptr %B
324 %tmp3 = zext <4 x i16> %tmp1 to <4 x i32>
325 %tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
326 %tmp5 = mul <4 x i32> %tmp3, %tmp4
; zext(<2 x i32>) * zext(<2 x i32>) computed in <2 x i64>. Every run
; configuration expects a single UMULL on the loaded d-registers.
330 define <2 x i64> @umull_v2i32_v2i64(ptr %A, ptr %B) nounwind {
331 ; CHECK-LABEL: umull_v2i32_v2i64:
333 ; CHECK-NEXT: ldr d0, [x0]
334 ; CHECK-NEXT: ldr d1, [x1]
335 ; CHECK-NEXT: umull v0.2d, v0.2s, v1.2s
337 %tmp1 = load <2 x i32>, ptr %A
338 %tmp2 = load <2 x i32>, ptr %B
339 %tmp3 = zext <2 x i32> %tmp1 to <2 x i64>
340 %tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
341 %tmp5 = mul <2 x i64> %tmp3, %tmp4
; zext(<8 x i8>) * zext(<8 x i8>) in <8 x i16>, with the product then masked
; to its low 8 bits (and with 255). Only the low half of each lane is kept,
; and the expectations differ per run: the NEON and SVE runs expect SMULL
; followed by a BIC of the high byte, while the GlobalISel run expects UMULL
; followed by an AND with a materialized mask.
345 define <8 x i16> @amull_v8i8_v8i16(ptr %A, ptr %B) nounwind {
346 ; CHECK-NEON-LABEL: amull_v8i8_v8i16:
347 ; CHECK-NEON: // %bb.0:
348 ; CHECK-NEON-NEXT: ldr d0, [x0]
349 ; CHECK-NEON-NEXT: ldr d1, [x1]
350 ; CHECK-NEON-NEXT: smull v0.8h, v0.8b, v1.8b
351 ; CHECK-NEON-NEXT: bic v0.8h, #255, lsl #8
352 ; CHECK-NEON-NEXT: ret
354 ; CHECK-SVE-LABEL: amull_v8i8_v8i16:
355 ; CHECK-SVE: // %bb.0:
356 ; CHECK-SVE-NEXT: ldr d0, [x0]
357 ; CHECK-SVE-NEXT: ldr d1, [x1]
358 ; CHECK-SVE-NEXT: smull v0.8h, v0.8b, v1.8b
359 ; CHECK-SVE-NEXT: bic v0.8h, #255, lsl #8
360 ; CHECK-SVE-NEXT: ret
362 ; CHECK-GI-LABEL: amull_v8i8_v8i16:
363 ; CHECK-GI: // %bb.0:
364 ; CHECK-GI-NEXT: ldr d1, [x0]
365 ; CHECK-GI-NEXT: ldr d2, [x1]
366 ; CHECK-GI-NEXT: movi v0.2d, #0xff00ff00ff00ff
367 ; CHECK-GI-NEXT: umull v1.8h, v1.8b, v2.8b
368 ; CHECK-GI-NEXT: and v0.16b, v1.16b, v0.16b
370 %tmp1 = load <8 x i8>, ptr %A
371 %tmp2 = load <8 x i8>, ptr %B
372 %tmp3 = zext <8 x i8> %tmp1 to <8 x i16>
373 %tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
374 %tmp5 = mul <8 x i16> %tmp3, %tmp4
375 %and = and <8 x i16> %tmp5, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
; zext(<4 x i16>) * zext(<4 x i16>) in <4 x i32>, with the product masked to
; its low 16 bits (and with 65535). The NEON and SVE runs expect SMULL plus
; an AND with a materialized mask; the GlobalISel run expects UMULL plus the
; same AND.
379 define <4 x i32> @amull_v4i16_v4i32(ptr %A, ptr %B) nounwind {
380 ; CHECK-NEON-LABEL: amull_v4i16_v4i32:
381 ; CHECK-NEON: // %bb.0:
382 ; CHECK-NEON-NEXT: ldr d1, [x0]
383 ; CHECK-NEON-NEXT: ldr d2, [x1]
384 ; CHECK-NEON-NEXT: movi v0.2d, #0x00ffff0000ffff
385 ; CHECK-NEON-NEXT: smull v1.4s, v1.4h, v2.4h
386 ; CHECK-NEON-NEXT: and v0.16b, v1.16b, v0.16b
387 ; CHECK-NEON-NEXT: ret
389 ; CHECK-SVE-LABEL: amull_v4i16_v4i32:
390 ; CHECK-SVE: // %bb.0:
391 ; CHECK-SVE-NEXT: ldr d1, [x0]
392 ; CHECK-SVE-NEXT: ldr d2, [x1]
393 ; CHECK-SVE-NEXT: movi v0.2d, #0x00ffff0000ffff
394 ; CHECK-SVE-NEXT: smull v1.4s, v1.4h, v2.4h
395 ; CHECK-SVE-NEXT: and v0.16b, v1.16b, v0.16b
396 ; CHECK-SVE-NEXT: ret
398 ; CHECK-GI-LABEL: amull_v4i16_v4i32:
399 ; CHECK-GI: // %bb.0:
400 ; CHECK-GI-NEXT: ldr d1, [x0]
401 ; CHECK-GI-NEXT: ldr d2, [x1]
402 ; CHECK-GI-NEXT: movi v0.2d, #0x00ffff0000ffff
403 ; CHECK-GI-NEXT: umull v1.4s, v1.4h, v2.4h
404 ; CHECK-GI-NEXT: and v0.16b, v1.16b, v0.16b
406 %tmp1 = load <4 x i16>, ptr %A
407 %tmp2 = load <4 x i16>, ptr %B
408 %tmp3 = zext <4 x i16> %tmp1 to <4 x i32>
409 %tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
410 %tmp5 = mul <4 x i32> %tmp3, %tmp4
411 %and = and <4 x i32> %tmp5, <i32 65535, i32 65535, i32 65535, i32 65535>
; zext(<2 x i32>) * zext(<2 x i32>) in <2 x i64>, with the product masked to
; its low 32 bits (and with 4294967295). The NEON and SVE runs expect SMULL
; plus an AND with a materialized mask; the GlobalISel run expects UMULL
; plus the same AND.
415 define <2 x i64> @amull_v2i32_v2i64(ptr %A, ptr %B) nounwind {
416 ; CHECK-NEON-LABEL: amull_v2i32_v2i64:
417 ; CHECK-NEON: // %bb.0:
418 ; CHECK-NEON-NEXT: ldr d1, [x0]
419 ; CHECK-NEON-NEXT: ldr d2, [x1]
420 ; CHECK-NEON-NEXT: movi v0.2d, #0x000000ffffffff
421 ; CHECK-NEON-NEXT: smull v1.2d, v1.2s, v2.2s
422 ; CHECK-NEON-NEXT: and v0.16b, v1.16b, v0.16b
423 ; CHECK-NEON-NEXT: ret
425 ; CHECK-SVE-LABEL: amull_v2i32_v2i64:
426 ; CHECK-SVE: // %bb.0:
427 ; CHECK-SVE-NEXT: ldr d1, [x0]
428 ; CHECK-SVE-NEXT: ldr d2, [x1]
429 ; CHECK-SVE-NEXT: movi v0.2d, #0x000000ffffffff
430 ; CHECK-SVE-NEXT: smull v1.2d, v1.2s, v2.2s
431 ; CHECK-SVE-NEXT: and v0.16b, v1.16b, v0.16b
432 ; CHECK-SVE-NEXT: ret
434 ; CHECK-GI-LABEL: amull_v2i32_v2i64:
435 ; CHECK-GI: // %bb.0:
436 ; CHECK-GI-NEXT: ldr d1, [x0]
437 ; CHECK-GI-NEXT: ldr d2, [x1]
438 ; CHECK-GI-NEXT: movi v0.2d, #0x000000ffffffff
439 ; CHECK-GI-NEXT: umull v1.2d, v1.2s, v2.2s
440 ; CHECK-GI-NEXT: and v0.16b, v1.16b, v0.16b
442 %tmp1 = load <2 x i32>, ptr %A
443 %tmp2 = load <2 x i32>, ptr %B
444 %tmp3 = zext <2 x i32> %tmp1 to <2 x i64>
445 %tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
446 %tmp5 = mul <2 x i64> %tmp3, %tmp4
447 %and = and <2 x i64> %tmp5, <i64 4294967295, i64 4294967295>
; Accumulator + sext(<8 x i8>) * sext(<8 x i8>) in <8 x i16>. Every run
; configuration expects the multiply and add to fold into one SMLAL whose
; destination is the loaded accumulator register.
451 define <8 x i16> @smlal_v8i8_v8i16(ptr %A, ptr %B, ptr %C) nounwind {
452 ; CHECK-LABEL: smlal_v8i8_v8i16:
454 ; CHECK-NEXT: ldr q0, [x0]
455 ; CHECK-NEXT: ldr d1, [x1]
456 ; CHECK-NEXT: ldr d2, [x2]
457 ; CHECK-NEXT: smlal v0.8h, v1.8b, v2.8b
459 %tmp1 = load <8 x i16>, ptr %A
460 %tmp2 = load <8 x i8>, ptr %B
461 %tmp3 = load <8 x i8>, ptr %C
462 %tmp4 = sext <8 x i8> %tmp2 to <8 x i16>
463 %tmp5 = sext <8 x i8> %tmp3 to <8 x i16>
464 %tmp6 = mul <8 x i16> %tmp4, %tmp5
465 %tmp7 = add <8 x i16> %tmp1, %tmp6
; Accumulator + sext(<4 x i16>) * sext(<4 x i16>) in <4 x i32>. Every run
; configuration expects the multiply and add to fold into one SMLAL.
469 define <4 x i32> @smlal_v4i16_v4i32(ptr %A, ptr %B, ptr %C) nounwind {
470 ; CHECK-LABEL: smlal_v4i16_v4i32:
472 ; CHECK-NEXT: ldr q0, [x0]
473 ; CHECK-NEXT: ldr d1, [x1]
474 ; CHECK-NEXT: ldr d2, [x2]
475 ; CHECK-NEXT: smlal v0.4s, v1.4h, v2.4h
477 %tmp1 = load <4 x i32>, ptr %A
478 %tmp2 = load <4 x i16>, ptr %B
479 %tmp3 = load <4 x i16>, ptr %C
480 %tmp4 = sext <4 x i16> %tmp2 to <4 x i32>
481 %tmp5 = sext <4 x i16> %tmp3 to <4 x i32>
482 %tmp6 = mul <4 x i32> %tmp4, %tmp5
483 %tmp7 = add <4 x i32> %tmp1, %tmp6
; Accumulator + sext(<2 x i32>) * sext(<2 x i32>) in <2 x i64>. Every run
; configuration expects the multiply and add to fold into one SMLAL.
487 define <2 x i64> @smlal_v2i32_v2i64(ptr %A, ptr %B, ptr %C) nounwind {
488 ; CHECK-LABEL: smlal_v2i32_v2i64:
490 ; CHECK-NEXT: ldr q0, [x0]
491 ; CHECK-NEXT: ldr d1, [x1]
492 ; CHECK-NEXT: ldr d2, [x2]
493 ; CHECK-NEXT: smlal v0.2d, v1.2s, v2.2s
495 %tmp1 = load <2 x i64>, ptr %A
496 %tmp2 = load <2 x i32>, ptr %B
497 %tmp3 = load <2 x i32>, ptr %C
498 %tmp4 = sext <2 x i32> %tmp2 to <2 x i64>
499 %tmp5 = sext <2 x i32> %tmp3 to <2 x i64>
500 %tmp6 = mul <2 x i64> %tmp4, %tmp5
501 %tmp7 = add <2 x i64> %tmp1, %tmp6
; Accumulator + zext(<8 x i8>) * zext(<8 x i8>) in <8 x i16>. Every run
; configuration expects the multiply and add to fold into one UMLAL.
505 define <8 x i16> @umlal_v8i8_v8i16(ptr %A, ptr %B, ptr %C) nounwind {
506 ; CHECK-LABEL: umlal_v8i8_v8i16:
508 ; CHECK-NEXT: ldr q0, [x0]
509 ; CHECK-NEXT: ldr d1, [x1]
510 ; CHECK-NEXT: ldr d2, [x2]
511 ; CHECK-NEXT: umlal v0.8h, v1.8b, v2.8b
513 %tmp1 = load <8 x i16>, ptr %A
514 %tmp2 = load <8 x i8>, ptr %B
515 %tmp3 = load <8 x i8>, ptr %C
516 %tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
517 %tmp5 = zext <8 x i8> %tmp3 to <8 x i16>
518 %tmp6 = mul <8 x i16> %tmp4, %tmp5
519 %tmp7 = add <8 x i16> %tmp1, %tmp6
; Accumulator + zext(<4 x i16>) * zext(<4 x i16>) in <4 x i32>. Every run
; configuration expects the multiply and add to fold into one UMLAL.
523 define <4 x i32> @umlal_v4i16_v4i32(ptr %A, ptr %B, ptr %C) nounwind {
524 ; CHECK-LABEL: umlal_v4i16_v4i32:
526 ; CHECK-NEXT: ldr q0, [x0]
527 ; CHECK-NEXT: ldr d1, [x1]
528 ; CHECK-NEXT: ldr d2, [x2]
529 ; CHECK-NEXT: umlal v0.4s, v1.4h, v2.4h
531 %tmp1 = load <4 x i32>, ptr %A
532 %tmp2 = load <4 x i16>, ptr %B
533 %tmp3 = load <4 x i16>, ptr %C
534 %tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
535 %tmp5 = zext <4 x i16> %tmp3 to <4 x i32>
536 %tmp6 = mul <4 x i32> %tmp4, %tmp5
537 %tmp7 = add <4 x i32> %tmp1, %tmp6
; Accumulator + zext(<2 x i32>) * zext(<2 x i32>) in <2 x i64>. Every run
; configuration expects the multiply and add to fold into one UMLAL.
541 define <2 x i64> @umlal_v2i32_v2i64(ptr %A, ptr %B, ptr %C) nounwind {
542 ; CHECK-LABEL: umlal_v2i32_v2i64:
544 ; CHECK-NEXT: ldr q0, [x0]
545 ; CHECK-NEXT: ldr d1, [x1]
546 ; CHECK-NEXT: ldr d2, [x2]
547 ; CHECK-NEXT: umlal v0.2d, v1.2s, v2.2s
549 %tmp1 = load <2 x i64>, ptr %A
550 %tmp2 = load <2 x i32>, ptr %B
551 %tmp3 = load <2 x i32>, ptr %C
552 %tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
553 %tmp5 = zext <2 x i32> %tmp3 to <2 x i64>
554 %tmp6 = mul <2 x i64> %tmp4, %tmp5
555 %tmp7 = add <2 x i64> %tmp1, %tmp6
; Accumulator + zext(<8 x i8>) * zext(<8 x i8>) in <8 x i16>, with the sum
; masked to its low 8 bits (and with 255). The NEON and SVE runs expect
; SMLAL followed by a BIC of the high byte; the GlobalISel run expects UMLAL
; followed by an AND with a materialized mask.
559 define <8 x i16> @amlal_v8i8_v8i16(ptr %A, ptr %B, ptr %C) nounwind {
560 ; CHECK-NEON-LABEL: amlal_v8i8_v8i16:
561 ; CHECK-NEON: // %bb.0:
562 ; CHECK-NEON-NEXT: ldr q0, [x0]
563 ; CHECK-NEON-NEXT: ldr d1, [x1]
564 ; CHECK-NEON-NEXT: ldr d2, [x2]
565 ; CHECK-NEON-NEXT: smlal v0.8h, v1.8b, v2.8b
566 ; CHECK-NEON-NEXT: bic v0.8h, #255, lsl #8
567 ; CHECK-NEON-NEXT: ret
569 ; CHECK-SVE-LABEL: amlal_v8i8_v8i16:
570 ; CHECK-SVE: // %bb.0:
571 ; CHECK-SVE-NEXT: ldr q0, [x0]
572 ; CHECK-SVE-NEXT: ldr d1, [x1]
573 ; CHECK-SVE-NEXT: ldr d2, [x2]
574 ; CHECK-SVE-NEXT: smlal v0.8h, v1.8b, v2.8b
575 ; CHECK-SVE-NEXT: bic v0.8h, #255, lsl #8
576 ; CHECK-SVE-NEXT: ret
578 ; CHECK-GI-LABEL: amlal_v8i8_v8i16:
579 ; CHECK-GI: // %bb.0:
580 ; CHECK-GI-NEXT: ldr q0, [x0]
581 ; CHECK-GI-NEXT: ldr d1, [x1]
582 ; CHECK-GI-NEXT: movi v3.2d, #0xff00ff00ff00ff
583 ; CHECK-GI-NEXT: ldr d2, [x2]
584 ; CHECK-GI-NEXT: umlal v0.8h, v1.8b, v2.8b
585 ; CHECK-GI-NEXT: and v0.16b, v0.16b, v3.16b
587 %tmp1 = load <8 x i16>, ptr %A
588 %tmp2 = load <8 x i8>, ptr %B
589 %tmp3 = load <8 x i8>, ptr %C
590 %tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
591 %tmp5 = zext <8 x i8> %tmp3 to <8 x i16>
592 %tmp6 = mul <8 x i16> %tmp4, %tmp5
593 %tmp7 = add <8 x i16> %tmp1, %tmp6
594 %and = and <8 x i16> %tmp7, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
; Accumulator + zext(<4 x i16>) * zext(<4 x i16>) in <4 x i32>, with the sum
; masked to its low 16 bits (and with 65535). The NEON and SVE runs expect
; SMLAL plus an AND with a materialized mask; the GlobalISel run expects
; UMLAL plus the same AND.
598 define <4 x i32> @amlal_v4i16_v4i32(ptr %A, ptr %B, ptr %C) nounwind {
599 ; CHECK-NEON-LABEL: amlal_v4i16_v4i32:
600 ; CHECK-NEON: // %bb.0:
601 ; CHECK-NEON-NEXT: ldr q0, [x0]
602 ; CHECK-NEON-NEXT: ldr d1, [x1]
603 ; CHECK-NEON-NEXT: ldr d2, [x2]
604 ; CHECK-NEON-NEXT: smlal v0.4s, v1.4h, v2.4h
605 ; CHECK-NEON-NEXT: movi v1.2d, #0x00ffff0000ffff
606 ; CHECK-NEON-NEXT: and v0.16b, v0.16b, v1.16b
607 ; CHECK-NEON-NEXT: ret
609 ; CHECK-SVE-LABEL: amlal_v4i16_v4i32:
610 ; CHECK-SVE: // %bb.0:
611 ; CHECK-SVE-NEXT: ldr q0, [x0]
612 ; CHECK-SVE-NEXT: ldr d1, [x1]
613 ; CHECK-SVE-NEXT: ldr d2, [x2]
614 ; CHECK-SVE-NEXT: smlal v0.4s, v1.4h, v2.4h
615 ; CHECK-SVE-NEXT: movi v1.2d, #0x00ffff0000ffff
616 ; CHECK-SVE-NEXT: and v0.16b, v0.16b, v1.16b
617 ; CHECK-SVE-NEXT: ret
619 ; CHECK-GI-LABEL: amlal_v4i16_v4i32:
620 ; CHECK-GI: // %bb.0:
621 ; CHECK-GI-NEXT: ldr q0, [x0]
622 ; CHECK-GI-NEXT: ldr d1, [x1]
623 ; CHECK-GI-NEXT: movi v3.2d, #0x00ffff0000ffff
624 ; CHECK-GI-NEXT: ldr d2, [x2]
625 ; CHECK-GI-NEXT: umlal v0.4s, v1.4h, v2.4h
626 ; CHECK-GI-NEXT: and v0.16b, v0.16b, v3.16b
628 %tmp1 = load <4 x i32>, ptr %A
629 %tmp2 = load <4 x i16>, ptr %B
630 %tmp3 = load <4 x i16>, ptr %C
631 %tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
632 %tmp5 = zext <4 x i16> %tmp3 to <4 x i32>
633 %tmp6 = mul <4 x i32> %tmp4, %tmp5
634 %tmp7 = add <4 x i32> %tmp1, %tmp6
635 %and = and <4 x i32> %tmp7, <i32 65535, i32 65535, i32 65535, i32 65535>
; Accumulator + zext(<2 x i32>) * zext(<2 x i32>) in <2 x i64>, with the sum
; masked to its low 32 bits (and with 4294967295). The NEON and SVE runs
; expect SMLAL plus an AND with a materialized mask; the GlobalISel run
; expects UMLAL plus the same AND.
639 define <2 x i64> @amlal_v2i32_v2i64(ptr %A, ptr %B, ptr %C) nounwind {
640 ; CHECK-NEON-LABEL: amlal_v2i32_v2i64:
641 ; CHECK-NEON: // %bb.0:
642 ; CHECK-NEON-NEXT: ldr q0, [x0]
643 ; CHECK-NEON-NEXT: ldr d1, [x1]
644 ; CHECK-NEON-NEXT: ldr d2, [x2]
645 ; CHECK-NEON-NEXT: smlal v0.2d, v1.2s, v2.2s
646 ; CHECK-NEON-NEXT: movi v1.2d, #0x000000ffffffff
647 ; CHECK-NEON-NEXT: and v0.16b, v0.16b, v1.16b
648 ; CHECK-NEON-NEXT: ret
650 ; CHECK-SVE-LABEL: amlal_v2i32_v2i64:
651 ; CHECK-SVE: // %bb.0:
652 ; CHECK-SVE-NEXT: ldr q0, [x0]
653 ; CHECK-SVE-NEXT: ldr d1, [x1]
654 ; CHECK-SVE-NEXT: ldr d2, [x2]
655 ; CHECK-SVE-NEXT: smlal v0.2d, v1.2s, v2.2s
656 ; CHECK-SVE-NEXT: movi v1.2d, #0x000000ffffffff
657 ; CHECK-SVE-NEXT: and v0.16b, v0.16b, v1.16b
658 ; CHECK-SVE-NEXT: ret
660 ; CHECK-GI-LABEL: amlal_v2i32_v2i64:
661 ; CHECK-GI: // %bb.0:
662 ; CHECK-GI-NEXT: ldr q0, [x0]
663 ; CHECK-GI-NEXT: ldr d1, [x1]
664 ; CHECK-GI-NEXT: movi v3.2d, #0x000000ffffffff
665 ; CHECK-GI-NEXT: ldr d2, [x2]
666 ; CHECK-GI-NEXT: umlal v0.2d, v1.2s, v2.2s
667 ; CHECK-GI-NEXT: and v0.16b, v0.16b, v3.16b
669 %tmp1 = load <2 x i64>, ptr %A
670 %tmp2 = load <2 x i32>, ptr %B
671 %tmp3 = load <2 x i32>, ptr %C
672 %tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
673 %tmp5 = zext <2 x i32> %tmp3 to <2 x i64>
674 %tmp6 = mul <2 x i64> %tmp4, %tmp5
675 %tmp7 = add <2 x i64> %tmp1, %tmp6
676 %and = and <2 x i64> %tmp7, <i64 4294967295, i64 4294967295>
; Accumulator - sext(<8 x i8>) * sext(<8 x i8>) in <8 x i16>. Every run
; configuration expects the multiply and subtract to fold into one SMLSL.
680 define <8 x i16> @smlsl_v8i8_v8i16(ptr %A, ptr %B, ptr %C) nounwind {
681 ; CHECK-LABEL: smlsl_v8i8_v8i16:
683 ; CHECK-NEXT: ldr q0, [x0]
684 ; CHECK-NEXT: ldr d1, [x1]
685 ; CHECK-NEXT: ldr d2, [x2]
686 ; CHECK-NEXT: smlsl v0.8h, v1.8b, v2.8b
688 %tmp1 = load <8 x i16>, ptr %A
689 %tmp2 = load <8 x i8>, ptr %B
690 %tmp3 = load <8 x i8>, ptr %C
691 %tmp4 = sext <8 x i8> %tmp2 to <8 x i16>
692 %tmp5 = sext <8 x i8> %tmp3 to <8 x i16>
693 %tmp6 = mul <8 x i16> %tmp4, %tmp5
694 %tmp7 = sub <8 x i16> %tmp1, %tmp6
; Accumulator - sext(<4 x i16>) * sext(<4 x i16>) in <4 x i32>. Every run
; configuration expects the multiply and subtract to fold into one SMLSL.
698 define <4 x i32> @smlsl_v4i16_v4i32(ptr %A, ptr %B, ptr %C) nounwind {
699 ; CHECK-LABEL: smlsl_v4i16_v4i32:
701 ; CHECK-NEXT: ldr q0, [x0]
702 ; CHECK-NEXT: ldr d1, [x1]
703 ; CHECK-NEXT: ldr d2, [x2]
704 ; CHECK-NEXT: smlsl v0.4s, v1.4h, v2.4h
706 %tmp1 = load <4 x i32>, ptr %A
707 %tmp2 = load <4 x i16>, ptr %B
708 %tmp3 = load <4 x i16>, ptr %C
709 %tmp4 = sext <4 x i16> %tmp2 to <4 x i32>
710 %tmp5 = sext <4 x i16> %tmp3 to <4 x i32>
711 %tmp6 = mul <4 x i32> %tmp4, %tmp5
712 %tmp7 = sub <4 x i32> %tmp1, %tmp6
; Accumulator - sext(<2 x i32>) * sext(<2 x i32>) in <2 x i64>. Every run
; configuration expects the multiply and subtract to fold into one SMLSL.
716 define <2 x i64> @smlsl_v2i32_v2i64(ptr %A, ptr %B, ptr %C) nounwind {
717 ; CHECK-LABEL: smlsl_v2i32_v2i64:
719 ; CHECK-NEXT: ldr q0, [x0]
720 ; CHECK-NEXT: ldr d1, [x1]
721 ; CHECK-NEXT: ldr d2, [x2]
722 ; CHECK-NEXT: smlsl v0.2d, v1.2s, v2.2s
724 %tmp1 = load <2 x i64>, ptr %A
725 %tmp2 = load <2 x i32>, ptr %B
726 %tmp3 = load <2 x i32>, ptr %C
727 %tmp4 = sext <2 x i32> %tmp2 to <2 x i64>
728 %tmp5 = sext <2 x i32> %tmp3 to <2 x i64>
729 %tmp6 = mul <2 x i64> %tmp4, %tmp5
730 %tmp7 = sub <2 x i64> %tmp1, %tmp6
; Accumulator - zext(<8 x i8>) * zext(<8 x i8>) in <8 x i16>. Every run
; configuration expects the multiply and subtract to fold into one UMLSL.
734 define <8 x i16> @umlsl_v8i8_v8i16(ptr %A, ptr %B, ptr %C) nounwind {
735 ; CHECK-LABEL: umlsl_v8i8_v8i16:
737 ; CHECK-NEXT: ldr q0, [x0]
738 ; CHECK-NEXT: ldr d1, [x1]
739 ; CHECK-NEXT: ldr d2, [x2]
740 ; CHECK-NEXT: umlsl v0.8h, v1.8b, v2.8b
742 %tmp1 = load <8 x i16>, ptr %A
743 %tmp2 = load <8 x i8>, ptr %B
744 %tmp3 = load <8 x i8>, ptr %C
745 %tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
746 %tmp5 = zext <8 x i8> %tmp3 to <8 x i16>
747 %tmp6 = mul <8 x i16> %tmp4, %tmp5
748 %tmp7 = sub <8 x i16> %tmp1, %tmp6
; Accumulator - zext(<4 x i16>) * zext(<4 x i16>) in <4 x i32>. Every run
; configuration expects the multiply and subtract to fold into one UMLSL.
752 define <4 x i32> @umlsl_v4i16_v4i32(ptr %A, ptr %B, ptr %C) nounwind {
753 ; CHECK-LABEL: umlsl_v4i16_v4i32:
755 ; CHECK-NEXT: ldr q0, [x0]
756 ; CHECK-NEXT: ldr d1, [x1]
757 ; CHECK-NEXT: ldr d2, [x2]
758 ; CHECK-NEXT: umlsl v0.4s, v1.4h, v2.4h
760 %tmp1 = load <4 x i32>, ptr %A
761 %tmp2 = load <4 x i16>, ptr %B
762 %tmp3 = load <4 x i16>, ptr %C
763 %tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
764 %tmp5 = zext <4 x i16> %tmp3 to <4 x i32>
765 %tmp6 = mul <4 x i32> %tmp4, %tmp5
766 %tmp7 = sub <4 x i32> %tmp1, %tmp6
; Accumulator - zext(<2 x i32>) * zext(<2 x i32>) in <2 x i64>. Every run
; configuration expects the multiply and subtract to fold into one UMLSL.
770 define <2 x i64> @umlsl_v2i32_v2i64(ptr %A, ptr %B, ptr %C) nounwind {
771 ; CHECK-LABEL: umlsl_v2i32_v2i64:
773 ; CHECK-NEXT: ldr q0, [x0]
774 ; CHECK-NEXT: ldr d1, [x1]
775 ; CHECK-NEXT: ldr d2, [x2]
776 ; CHECK-NEXT: umlsl v0.2d, v1.2s, v2.2s
778 %tmp1 = load <2 x i64>, ptr %A
779 %tmp2 = load <2 x i32>, ptr %B
780 %tmp3 = load <2 x i32>, ptr %C
781 %tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
782 %tmp5 = zext <2 x i32> %tmp3 to <2 x i64>
783 %tmp6 = mul <2 x i64> %tmp4, %tmp5
784 %tmp7 = sub <2 x i64> %tmp1, %tmp6
; Accumulator - zext(<8 x i8>) * zext(<8 x i8>) in <8 x i16>, with the
; difference masked to its low 8 bits (and with 255). The NEON and SVE runs
; expect SMLSL followed by a BIC of the high byte; the GlobalISel run
; expects UMLSL followed by an AND with a materialized mask.
788 define <8 x i16> @amlsl_v8i8_v8i16(ptr %A, ptr %B, ptr %C) nounwind {
789 ; CHECK-NEON-LABEL: amlsl_v8i8_v8i16:
790 ; CHECK-NEON: // %bb.0:
791 ; CHECK-NEON-NEXT: ldr q0, [x0]
792 ; CHECK-NEON-NEXT: ldr d1, [x1]
793 ; CHECK-NEON-NEXT: ldr d2, [x2]
794 ; CHECK-NEON-NEXT: smlsl v0.8h, v1.8b, v2.8b
795 ; CHECK-NEON-NEXT: bic v0.8h, #255, lsl #8
796 ; CHECK-NEON-NEXT: ret
798 ; CHECK-SVE-LABEL: amlsl_v8i8_v8i16:
799 ; CHECK-SVE: // %bb.0:
800 ; CHECK-SVE-NEXT: ldr q0, [x0]
801 ; CHECK-SVE-NEXT: ldr d1, [x1]
802 ; CHECK-SVE-NEXT: ldr d2, [x2]
803 ; CHECK-SVE-NEXT: smlsl v0.8h, v1.8b, v2.8b
804 ; CHECK-SVE-NEXT: bic v0.8h, #255, lsl #8
805 ; CHECK-SVE-NEXT: ret
807 ; CHECK-GI-LABEL: amlsl_v8i8_v8i16:
808 ; CHECK-GI: // %bb.0:
809 ; CHECK-GI-NEXT: ldr q0, [x0]
810 ; CHECK-GI-NEXT: ldr d1, [x1]
811 ; CHECK-GI-NEXT: movi v3.2d, #0xff00ff00ff00ff
812 ; CHECK-GI-NEXT: ldr d2, [x2]
813 ; CHECK-GI-NEXT: umlsl v0.8h, v1.8b, v2.8b
814 ; CHECK-GI-NEXT: and v0.16b, v0.16b, v3.16b
816 %tmp1 = load <8 x i16>, ptr %A
817 %tmp2 = load <8 x i8>, ptr %B
818 %tmp3 = load <8 x i8>, ptr %C
819 %tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
820 %tmp5 = zext <8 x i8> %tmp3 to <8 x i16>
821 %tmp6 = mul <8 x i16> %tmp4, %tmp5
822 %tmp7 = sub <8 x i16> %tmp1, %tmp6
823 %and = and <8 x i16> %tmp7, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
; Accumulator - zext(<4 x i16>) * zext(<4 x i16>) in <4 x i32>, with the
; difference masked to its low 16 bits (and with 65535). The NEON and SVE
; runs expect SMLSL plus an AND with a materialized mask; the GlobalISel run
; expects UMLSL plus the same AND.
827 define <4 x i32> @amlsl_v4i16_v4i32(ptr %A, ptr %B, ptr %C) nounwind {
828 ; CHECK-NEON-LABEL: amlsl_v4i16_v4i32:
829 ; CHECK-NEON: // %bb.0:
830 ; CHECK-NEON-NEXT: ldr q0, [x0]
831 ; CHECK-NEON-NEXT: ldr d1, [x1]
832 ; CHECK-NEON-NEXT: ldr d2, [x2]
833 ; CHECK-NEON-NEXT: smlsl v0.4s, v1.4h, v2.4h
834 ; CHECK-NEON-NEXT: movi v1.2d, #0x00ffff0000ffff
835 ; CHECK-NEON-NEXT: and v0.16b, v0.16b, v1.16b
836 ; CHECK-NEON-NEXT: ret
838 ; CHECK-SVE-LABEL: amlsl_v4i16_v4i32:
839 ; CHECK-SVE: // %bb.0:
840 ; CHECK-SVE-NEXT: ldr q0, [x0]
841 ; CHECK-SVE-NEXT: ldr d1, [x1]
842 ; CHECK-SVE-NEXT: ldr d2, [x2]
843 ; CHECK-SVE-NEXT: smlsl v0.4s, v1.4h, v2.4h
844 ; CHECK-SVE-NEXT: movi v1.2d, #0x00ffff0000ffff
845 ; CHECK-SVE-NEXT: and v0.16b, v0.16b, v1.16b
846 ; CHECK-SVE-NEXT: ret
848 ; CHECK-GI-LABEL: amlsl_v4i16_v4i32:
849 ; CHECK-GI: // %bb.0:
850 ; CHECK-GI-NEXT: ldr q0, [x0]
851 ; CHECK-GI-NEXT: ldr d1, [x1]
852 ; CHECK-GI-NEXT: movi v3.2d, #0x00ffff0000ffff
853 ; CHECK-GI-NEXT: ldr d2, [x2]
854 ; CHECK-GI-NEXT: umlsl v0.4s, v1.4h, v2.4h
855 ; CHECK-GI-NEXT: and v0.16b, v0.16b, v3.16b
857 %tmp1 = load <4 x i32>, ptr %A
858 %tmp2 = load <4 x i16>, ptr %B
859 %tmp3 = load <4 x i16>, ptr %C
860 %tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
861 %tmp5 = zext <4 x i16> %tmp3 to <4 x i32>
862 %tmp6 = mul <4 x i32> %tmp4, %tmp5
863 %tmp7 = sub <4 x i32> %tmp1, %tmp6
864 %and = and <4 x i32> %tmp7, <i32 65535, i32 65535, i32 65535, i32 65535>
; Accumulator - zext(<2 x i32>) * zext(<2 x i32>) in <2 x i64>, with the
; difference masked to its low 32 bits (and with 4294967295). The NEON and
; SVE runs expect SMLSL plus an AND with a materialized mask; the GlobalISel
; run expects UMLSL plus the same AND.
868 define <2 x i64> @amlsl_v2i32_v2i64(ptr %A, ptr %B, ptr %C) nounwind {
869 ; CHECK-NEON-LABEL: amlsl_v2i32_v2i64:
870 ; CHECK-NEON: // %bb.0:
871 ; CHECK-NEON-NEXT: ldr q0, [x0]
872 ; CHECK-NEON-NEXT: ldr d1, [x1]
873 ; CHECK-NEON-NEXT: ldr d2, [x2]
874 ; CHECK-NEON-NEXT: smlsl v0.2d, v1.2s, v2.2s
875 ; CHECK-NEON-NEXT: movi v1.2d, #0x000000ffffffff
876 ; CHECK-NEON-NEXT: and v0.16b, v0.16b, v1.16b
877 ; CHECK-NEON-NEXT: ret
879 ; CHECK-SVE-LABEL: amlsl_v2i32_v2i64:
880 ; CHECK-SVE: // %bb.0:
881 ; CHECK-SVE-NEXT: ldr q0, [x0]
882 ; CHECK-SVE-NEXT: ldr d1, [x1]
883 ; CHECK-SVE-NEXT: ldr d2, [x2]
884 ; CHECK-SVE-NEXT: smlsl v0.2d, v1.2s, v2.2s
885 ; CHECK-SVE-NEXT: movi v1.2d, #0x000000ffffffff
886 ; CHECK-SVE-NEXT: and v0.16b, v0.16b, v1.16b
887 ; CHECK-SVE-NEXT: ret
889 ; CHECK-GI-LABEL: amlsl_v2i32_v2i64:
890 ; CHECK-GI: // %bb.0:
891 ; CHECK-GI-NEXT: ldr q0, [x0]
892 ; CHECK-GI-NEXT: ldr d1, [x1]
893 ; CHECK-GI-NEXT: movi v3.2d, #0x000000ffffffff
894 ; CHECK-GI-NEXT: ldr d2, [x2]
895 ; CHECK-GI-NEXT: umlsl v0.2d, v1.2s, v2.2s
896 ; CHECK-GI-NEXT: and v0.16b, v0.16b, v3.16b
898 %tmp1 = load <2 x i64>, ptr %A
899 %tmp2 = load <2 x i32>, ptr %B
900 %tmp3 = load <2 x i32>, ptr %C
901 %tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
902 %tmp5 = zext <2 x i32> %tmp3 to <2 x i64>
903 %tmp6 = mul <2 x i64> %tmp4, %tmp5
904 %tmp7 = sub <2 x i64> %tmp1, %tmp6
905 %and = and <2 x i64> %tmp7, <i64 4294967295, i64 4294967295>
909 ; Check that SMULL is recognized when one operand is a BUILD_VECTOR whose elements are sign/zero-extended constants.
; sext(<8 x i8>) multiplied by a splat of the constant -12 in <8 x i16>.
; The NEON and SVE runs expect the constant to be materialized as an 8-bit
; splat (#244, i.e. -12 as an unsigned byte) and fed to SMULL. The
; GlobalISel run expects a 16-bit splat, an explicit SSHLL and a plain MUL.
910 define <8 x i16> @smull_extvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
911 ; CHECK-NEON-LABEL: smull_extvec_v8i8_v8i16:
912 ; CHECK-NEON: // %bb.0:
913 ; CHECK-NEON-NEXT: movi v1.8b, #244
914 ; CHECK-NEON-NEXT: smull v0.8h, v0.8b, v1.8b
915 ; CHECK-NEON-NEXT: ret
917 ; CHECK-SVE-LABEL: smull_extvec_v8i8_v8i16:
918 ; CHECK-SVE: // %bb.0:
919 ; CHECK-SVE-NEXT: movi v1.8b, #244
920 ; CHECK-SVE-NEXT: smull v0.8h, v0.8b, v1.8b
921 ; CHECK-SVE-NEXT: ret
923 ; CHECK-GI-LABEL: smull_extvec_v8i8_v8i16:
924 ; CHECK-GI: // %bb.0:
925 ; CHECK-GI-NEXT: mvni v1.8h, #11
926 ; CHECK-GI-NEXT: sshll v0.8h, v0.8b, #0
927 ; CHECK-GI-NEXT: mul v0.8h, v0.8h, v1.8h
929 %tmp3 = sext <8 x i8> %arg to <8 x i16>
930 %tmp4 = mul <8 x i16> %tmp3, <i16 -12, i16 -12, i16 -12, i16 -12, i16 -12, i16 -12, i16 -12, i16 -12>
; Negative counterpart of smull_extvec_v8i8_v8i16: the splat constant is
; -999 instead of -12. No run configuration expects SMULL; each expects an
; explicit SSHLL of the argument and a 16-bit MUL by the materialized
; constant (0xfc19 is -999 as an unsigned 16-bit value).
934 define <8 x i16> @smull_noextvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
935 ; Do not use SMULL if the BUILD_VECTOR element values are too big.
936 ; CHECK-NEON-LABEL: smull_noextvec_v8i8_v8i16:
937 ; CHECK-NEON: // %bb.0:
938 ; CHECK-NEON-NEXT: mov w8, #64537 // =0xfc19
939 ; CHECK-NEON-NEXT: sshll v0.8h, v0.8b, #0
940 ; CHECK-NEON-NEXT: dup v1.8h, w8
941 ; CHECK-NEON-NEXT: mul v0.8h, v0.8h, v1.8h
942 ; CHECK-NEON-NEXT: ret
944 ; CHECK-SVE-LABEL: smull_noextvec_v8i8_v8i16:
945 ; CHECK-SVE: // %bb.0:
946 ; CHECK-SVE-NEXT: mov w8, #64537 // =0xfc19
947 ; CHECK-SVE-NEXT: sshll v0.8h, v0.8b, #0
948 ; CHECK-SVE-NEXT: dup v1.8h, w8
949 ; CHECK-SVE-NEXT: mul v0.8h, v0.8h, v1.8h
950 ; CHECK-SVE-NEXT: ret
952 ; CHECK-GI-LABEL: smull_noextvec_v8i8_v8i16:
953 ; CHECK-GI: // %bb.0:
954 ; CHECK-GI-NEXT: adrp x8, .LCPI34_0
955 ; CHECK-GI-NEXT: sshll v0.8h, v0.8b, #0
956 ; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI34_0]
957 ; CHECK-GI-NEXT: mul v0.8h, v0.8h, v1.8h
959 %tmp3 = sext <8 x i8> %arg to <8 x i16>
960 %tmp4 = mul <8 x i16> %tmp3, <i16 -999, i16 -999, i16 -999, i16 -999, i16 -999, i16 -999, i16 -999, i16 -999>
964 define <4 x i32> @smull_extvec_v4i16_v4i32(<4 x i16> %arg) nounwind {
; sext(<4 x i16>) multiplied by a splat of -12. The NEON and SVE runs expect
; smull against an mvni #11 splat of 16-bit lanes; the GlobalISel run expects
; sshll followed by a 32-bit-lane mul.
965 ; CHECK-NEON-LABEL: smull_extvec_v4i16_v4i32:
966 ; CHECK-NEON: // %bb.0:
967 ; CHECK-NEON-NEXT: mvni v1.4h, #11
968 ; CHECK-NEON-NEXT: smull v0.4s, v0.4h, v1.4h
969 ; CHECK-NEON-NEXT: ret
971 ; CHECK-SVE-LABEL: smull_extvec_v4i16_v4i32:
972 ; CHECK-SVE: // %bb.0:
973 ; CHECK-SVE-NEXT: mvni v1.4h, #11
974 ; CHECK-SVE-NEXT: smull v0.4s, v0.4h, v1.4h
975 ; CHECK-SVE-NEXT: ret
977 ; CHECK-GI-LABEL: smull_extvec_v4i16_v4i32:
978 ; CHECK-GI: // %bb.0:
979 ; CHECK-GI-NEXT: mvni v1.4s, #11
980 ; CHECK-GI-NEXT: sshll v0.4s, v0.4h, #0
981 ; CHECK-GI-NEXT: mul v0.4s, v0.4s, v1.4s
983 %tmp3 = sext <4 x i16> %arg to <4 x i32>
984 %tmp4 = mul <4 x i32> %tmp3, <i32 -12, i32 -12, i32 -12, i32 -12>
988 define <2 x i64> @smull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind {
; sext(<2 x i32>) multiplied by a splat of -1234. The NEON and SVE runs expect
; the constant to be materialised with mov + dup and fed to smull; the
; GlobalISel run expects sshll and two scalar 64-bit multiplies.
989 ; CHECK-NEON-LABEL: smull_extvec_v2i32_v2i64:
990 ; CHECK-NEON: // %bb.0:
991 ; CHECK-NEON-NEXT: mov w8, #-1234 // =0xfffffb2e
992 ; CHECK-NEON-NEXT: dup v1.2s, w8
993 ; CHECK-NEON-NEXT: smull v0.2d, v0.2s, v1.2s
994 ; CHECK-NEON-NEXT: ret
996 ; CHECK-SVE-LABEL: smull_extvec_v2i32_v2i64:
997 ; CHECK-SVE: // %bb.0:
998 ; CHECK-SVE-NEXT: mov w8, #-1234 // =0xfffffb2e
999 ; CHECK-SVE-NEXT: dup v1.2s, w8
1000 ; CHECK-SVE-NEXT: smull v0.2d, v0.2s, v1.2s
1001 ; CHECK-SVE-NEXT: ret
1003 ; CHECK-GI-LABEL: smull_extvec_v2i32_v2i64:
1004 ; CHECK-GI: // %bb.0:
1005 ; CHECK-GI-NEXT: adrp x8, .LCPI36_0
1006 ; CHECK-GI-NEXT: sshll v0.2d, v0.2s, #0
1007 ; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI36_0]
1008 ; CHECK-GI-NEXT: mov d2, v0.d[1]
1009 ; CHECK-GI-NEXT: mov d3, v1.d[1]
1010 ; CHECK-GI-NEXT: fmov x8, d0
1011 ; CHECK-GI-NEXT: fmov x9, d1
1012 ; CHECK-GI-NEXT: mul x8, x8, x9
1013 ; CHECK-GI-NEXT: fmov x9, d2
1014 ; CHECK-GI-NEXT: fmov x10, d3
1015 ; CHECK-GI-NEXT: mul x9, x9, x10
1016 ; CHECK-GI-NEXT: fmov d0, x8
1017 ; CHECK-GI-NEXT: mov v0.d[1], x9
1018 ; CHECK-GI-NEXT: ret
1019 %tmp3 = sext <2 x i32> %arg to <2 x i64>
1020 %tmp4 = mul <2 x i64> %tmp3, <i64 -1234, i64 -1234>
1024 define <8 x i16> @umull_extvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
; zext(<8 x i8>) multiplied by a splat of 12. The NEON and SVE runs expect
; umull against a movi #12 splat; the GlobalISel run expects ushll followed by
; a full-width mul.
1025 ; CHECK-NEON-LABEL: umull_extvec_v8i8_v8i16:
1026 ; CHECK-NEON: // %bb.0:
1027 ; CHECK-NEON-NEXT: movi v1.8b, #12
1028 ; CHECK-NEON-NEXT: umull v0.8h, v0.8b, v1.8b
1029 ; CHECK-NEON-NEXT: ret
1031 ; CHECK-SVE-LABEL: umull_extvec_v8i8_v8i16:
1032 ; CHECK-SVE: // %bb.0:
1033 ; CHECK-SVE-NEXT: movi v1.8b, #12
1034 ; CHECK-SVE-NEXT: umull v0.8h, v0.8b, v1.8b
1035 ; CHECK-SVE-NEXT: ret
1037 ; CHECK-GI-LABEL: umull_extvec_v8i8_v8i16:
1038 ; CHECK-GI: // %bb.0:
1039 ; CHECK-GI-NEXT: movi v1.8h, #12
1040 ; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0
1041 ; CHECK-GI-NEXT: mul v0.8h, v0.8h, v1.8h
1042 ; CHECK-GI-NEXT: ret
1043 %tmp3 = zext <8 x i8> %arg to <8 x i16>
1044 %tmp4 = mul <8 x i16> %tmp3, <i16 12, i16 12, i16 12, i16 12, i16 12, i16 12, i16 12, i16 12>
1048 define <8 x i16> @umull_noextvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
1049 ; Do not use UMULL if the BUILD_VECTOR element values are too big.
; The splat value 999 does not fit in 8 bits, so every run expects the input
; to be widened with ushll and multiplied with a plain mul.
1050 ; CHECK-NEON-LABEL: umull_noextvec_v8i8_v8i16:
1051 ; CHECK-NEON: // %bb.0:
1052 ; CHECK-NEON-NEXT: mov w8, #999 // =0x3e7
1053 ; CHECK-NEON-NEXT: ushll v0.8h, v0.8b, #0
1054 ; CHECK-NEON-NEXT: dup v1.8h, w8
1055 ; CHECK-NEON-NEXT: mul v0.8h, v0.8h, v1.8h
1056 ; CHECK-NEON-NEXT: ret
1058 ; CHECK-SVE-LABEL: umull_noextvec_v8i8_v8i16:
1059 ; CHECK-SVE: // %bb.0:
1060 ; CHECK-SVE-NEXT: mov w8, #999 // =0x3e7
1061 ; CHECK-SVE-NEXT: ushll v0.8h, v0.8b, #0
1062 ; CHECK-SVE-NEXT: dup v1.8h, w8
1063 ; CHECK-SVE-NEXT: mul v0.8h, v0.8h, v1.8h
1064 ; CHECK-SVE-NEXT: ret
1066 ; CHECK-GI-LABEL: umull_noextvec_v8i8_v8i16:
1067 ; CHECK-GI: // %bb.0:
1068 ; CHECK-GI-NEXT: adrp x8, .LCPI38_0
1069 ; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0
1070 ; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI38_0]
1071 ; CHECK-GI-NEXT: mul v0.8h, v0.8h, v1.8h
1072 ; CHECK-GI-NEXT: ret
1073 %tmp3 = zext <8 x i8> %arg to <8 x i16>
1074 %tmp4 = mul <8 x i16> %tmp3, <i16 999, i16 999, i16 999, i16 999, i16 999, i16 999, i16 999, i16 999>
1078 define <4 x i32> @umull_extvec_v4i16_v4i32(<4 x i16> %arg) nounwind {
; zext(<4 x i16>) multiplied by a splat of 1234. The NEON and SVE runs expect
; mov + dup of the constant into 16-bit lanes and umull; the GlobalISel run
; expects ushll, a constant-pool load and a 32-bit-lane mul.
1079 ; CHECK-NEON-LABEL: umull_extvec_v4i16_v4i32:
1080 ; CHECK-NEON: // %bb.0:
1081 ; CHECK-NEON-NEXT: mov w8, #1234 // =0x4d2
1082 ; CHECK-NEON-NEXT: dup v1.4h, w8
1083 ; CHECK-NEON-NEXT: umull v0.4s, v0.4h, v1.4h
1084 ; CHECK-NEON-NEXT: ret
1086 ; CHECK-SVE-LABEL: umull_extvec_v4i16_v4i32:
1087 ; CHECK-SVE: // %bb.0:
1088 ; CHECK-SVE-NEXT: mov w8, #1234 // =0x4d2
1089 ; CHECK-SVE-NEXT: dup v1.4h, w8
1090 ; CHECK-SVE-NEXT: umull v0.4s, v0.4h, v1.4h
1091 ; CHECK-SVE-NEXT: ret
1093 ; CHECK-GI-LABEL: umull_extvec_v4i16_v4i32:
1094 ; CHECK-GI: // %bb.0:
1095 ; CHECK-GI-NEXT: adrp x8, .LCPI39_0
1096 ; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0
1097 ; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI39_0]
1098 ; CHECK-GI-NEXT: mul v0.4s, v0.4s, v1.4s
1099 ; CHECK-GI-NEXT: ret
1100 %tmp3 = zext <4 x i16> %arg to <4 x i32>
1101 %tmp4 = mul <4 x i32> %tmp3, <i32 1234, i32 1234, i32 1234, i32 1234>
1105 define <2 x i64> @umull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind {
; zext(<2 x i32>) multiplied by a splat of 1234. The NEON and SVE runs expect
; mov + dup of the constant and umull; the GlobalISel run expects ushll and
; two scalar 64-bit multiplies.
1106 ; CHECK-NEON-LABEL: umull_extvec_v2i32_v2i64:
1107 ; CHECK-NEON: // %bb.0:
1108 ; CHECK-NEON-NEXT: mov w8, #1234 // =0x4d2
1109 ; CHECK-NEON-NEXT: dup v1.2s, w8
1110 ; CHECK-NEON-NEXT: umull v0.2d, v0.2s, v1.2s
1111 ; CHECK-NEON-NEXT: ret
1113 ; CHECK-SVE-LABEL: umull_extvec_v2i32_v2i64:
1114 ; CHECK-SVE: // %bb.0:
1115 ; CHECK-SVE-NEXT: mov w8, #1234 // =0x4d2
1116 ; CHECK-SVE-NEXT: dup v1.2s, w8
1117 ; CHECK-SVE-NEXT: umull v0.2d, v0.2s, v1.2s
1118 ; CHECK-SVE-NEXT: ret
1120 ; CHECK-GI-LABEL: umull_extvec_v2i32_v2i64:
1121 ; CHECK-GI: // %bb.0:
1122 ; CHECK-GI-NEXT: adrp x8, .LCPI40_0
1123 ; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0
1124 ; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI40_0]
1125 ; CHECK-GI-NEXT: mov d2, v0.d[1]
1126 ; CHECK-GI-NEXT: mov d3, v1.d[1]
1127 ; CHECK-GI-NEXT: fmov x8, d0
1128 ; CHECK-GI-NEXT: fmov x9, d1
1129 ; CHECK-GI-NEXT: mul x8, x8, x9
1130 ; CHECK-GI-NEXT: fmov x9, d2
1131 ; CHECK-GI-NEXT: fmov x10, d3
1132 ; CHECK-GI-NEXT: mul x9, x9, x10
1133 ; CHECK-GI-NEXT: fmov d0, x8
1134 ; CHECK-GI-NEXT: mov v0.d[1], x9
1135 ; CHECK-GI-NEXT: ret
1136 %tmp3 = zext <2 x i32> %arg to <2 x i64>
1137 %tmp4 = mul <2 x i64> %tmp3, <i64 1234, i64 1234>
1141 define <8 x i16> @amull_extvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
; zext(<8 x i8>) multiplied by a splat of 12, then masked to the low 8 bits of
; each lane. The NEON and SVE runs expect smull followed by bic; the
; GlobalISel run expects ushll, mul and an explicit and with the mask.
1142 ; CHECK-NEON-LABEL: amull_extvec_v8i8_v8i16:
1143 ; CHECK-NEON: // %bb.0:
1144 ; CHECK-NEON-NEXT: movi v1.8b, #12
1145 ; CHECK-NEON-NEXT: smull v0.8h, v0.8b, v1.8b
1146 ; CHECK-NEON-NEXT: bic v0.8h, #255, lsl #8
1147 ; CHECK-NEON-NEXT: ret
1149 ; CHECK-SVE-LABEL: amull_extvec_v8i8_v8i16:
1150 ; CHECK-SVE: // %bb.0:
1151 ; CHECK-SVE-NEXT: movi v1.8b, #12
1152 ; CHECK-SVE-NEXT: smull v0.8h, v0.8b, v1.8b
1153 ; CHECK-SVE-NEXT: bic v0.8h, #255, lsl #8
1154 ; CHECK-SVE-NEXT: ret
1156 ; CHECK-GI-LABEL: amull_extvec_v8i8_v8i16:
1157 ; CHECK-GI: // %bb.0:
1158 ; CHECK-GI-NEXT: movi v1.8h, #12
1159 ; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0
1160 ; CHECK-GI-NEXT: movi v2.2d, #0xff00ff00ff00ff
1161 ; CHECK-GI-NEXT: mul v0.8h, v0.8h, v1.8h
1162 ; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b
1163 ; CHECK-GI-NEXT: ret
1164 %tmp3 = zext <8 x i8> %arg to <8 x i16>
1165 %tmp4 = mul <8 x i16> %tmp3, <i16 12, i16 12, i16 12, i16 12, i16 12, i16 12, i16 12, i16 12>
1166 %and = and <8 x i16> %tmp4, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
1170 define <4 x i32> @amull_extvec_v4i16_v4i32(<4 x i16> %arg) nounwind {
; zext(<4 x i16>) multiplied by a splat of 1234, then masked to the low 16
; bits of each lane. The NEON and SVE runs expect smull followed by an and
; with the mask; the GlobalISel run expects ushll, mul and and.
1171 ; CHECK-NEON-LABEL: amull_extvec_v4i16_v4i32:
1172 ; CHECK-NEON: // %bb.0:
1173 ; CHECK-NEON-NEXT: mov w8, #1234 // =0x4d2
1174 ; CHECK-NEON-NEXT: dup v1.4h, w8
1175 ; CHECK-NEON-NEXT: smull v0.4s, v0.4h, v1.4h
1176 ; CHECK-NEON-NEXT: movi v1.2d, #0x00ffff0000ffff
1177 ; CHECK-NEON-NEXT: and v0.16b, v0.16b, v1.16b
1178 ; CHECK-NEON-NEXT: ret
1180 ; CHECK-SVE-LABEL: amull_extvec_v4i16_v4i32:
1181 ; CHECK-SVE: // %bb.0:
1182 ; CHECK-SVE-NEXT: mov w8, #1234 // =0x4d2
1183 ; CHECK-SVE-NEXT: dup v1.4h, w8
1184 ; CHECK-SVE-NEXT: smull v0.4s, v0.4h, v1.4h
1185 ; CHECK-SVE-NEXT: movi v1.2d, #0x00ffff0000ffff
1186 ; CHECK-SVE-NEXT: and v0.16b, v0.16b, v1.16b
1187 ; CHECK-SVE-NEXT: ret
1189 ; CHECK-GI-LABEL: amull_extvec_v4i16_v4i32:
1190 ; CHECK-GI: // %bb.0:
1191 ; CHECK-GI-NEXT: adrp x8, .LCPI42_0
1192 ; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0
1193 ; CHECK-GI-NEXT: movi v2.2d, #0x00ffff0000ffff
1194 ; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI42_0]
1195 ; CHECK-GI-NEXT: mul v0.4s, v0.4s, v1.4s
1196 ; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b
1197 ; CHECK-GI-NEXT: ret
1198 %tmp3 = zext <4 x i16> %arg to <4 x i32>
1199 %tmp4 = mul <4 x i32> %tmp3, <i32 1234, i32 1234, i32 1234, i32 1234>
1200 %and = and <4 x i32> %tmp4, <i32 65535, i32 65535, i32 65535, i32 65535>
1204 define <2 x i64> @amull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind {
; zext(<2 x i32>) multiplied by a splat of 1234, then masked to the low 32
; bits of each lane. The NEON and SVE runs expect smull followed by an and
; with the mask; the GlobalISel run expects ushll, two scalar 64-bit
; multiplies and and.
1205 ; CHECK-NEON-LABEL: amull_extvec_v2i32_v2i64:
1206 ; CHECK-NEON: // %bb.0:
1207 ; CHECK-NEON-NEXT: mov w8, #1234 // =0x4d2
1208 ; CHECK-NEON-NEXT: dup v1.2s, w8
1209 ; CHECK-NEON-NEXT: smull v0.2d, v0.2s, v1.2s
1210 ; CHECK-NEON-NEXT: movi v1.2d, #0x000000ffffffff
1211 ; CHECK-NEON-NEXT: and v0.16b, v0.16b, v1.16b
1212 ; CHECK-NEON-NEXT: ret
1214 ; CHECK-SVE-LABEL: amull_extvec_v2i32_v2i64:
1215 ; CHECK-SVE: // %bb.0:
1216 ; CHECK-SVE-NEXT: mov w8, #1234 // =0x4d2
1217 ; CHECK-SVE-NEXT: dup v1.2s, w8
1218 ; CHECK-SVE-NEXT: smull v0.2d, v0.2s, v1.2s
1219 ; CHECK-SVE-NEXT: movi v1.2d, #0x000000ffffffff
1220 ; CHECK-SVE-NEXT: and v0.16b, v0.16b, v1.16b
1221 ; CHECK-SVE-NEXT: ret
1223 ; CHECK-GI-LABEL: amull_extvec_v2i32_v2i64:
1224 ; CHECK-GI: // %bb.0:
1225 ; CHECK-GI-NEXT: adrp x8, .LCPI43_0
1226 ; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0
1227 ; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI43_0]
1228 ; CHECK-GI-NEXT: mov d2, v0.d[1]
1229 ; CHECK-GI-NEXT: mov d3, v1.d[1]
1230 ; CHECK-GI-NEXT: fmov x8, d0
1231 ; CHECK-GI-NEXT: fmov x9, d1
1232 ; CHECK-GI-NEXT: movi v1.2d, #0x000000ffffffff
1233 ; CHECK-GI-NEXT: mul x8, x8, x9
1234 ; CHECK-GI-NEXT: fmov x9, d2
1235 ; CHECK-GI-NEXT: fmov x10, d3
1236 ; CHECK-GI-NEXT: mul x9, x9, x10
1237 ; CHECK-GI-NEXT: fmov d0, x8
1238 ; CHECK-GI-NEXT: mov v0.d[1], x9
1239 ; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b
1240 ; CHECK-GI-NEXT: ret
1241 %tmp3 = zext <2 x i32> %arg to <2 x i64>
1242 %tmp4 = mul <2 x i64> %tmp3, <i64 1234, i64 1234>
1243 %and = and <2 x i64> %tmp4, <i64 4294967295, i64 4294967295>
1247 define i16 @smullWithInconsistentExtensions(<8 x i8> %x, <8 x i8> %y) {
; %x is sign-extended and %y is zero-extended before the multiply; all runs
; expect sshll + ushll + a plain mul, then umov to extract lane 0.
1248 ; If one operand has a zero-extend and the other a sign-extend, smull
1250 ; CHECK-LABEL: smullWithInconsistentExtensions:
1252 ; CHECK-NEXT: sshll v0.8h, v0.8b, #0
1253 ; CHECK-NEXT: ushll v1.8h, v1.8b, #0
1254 ; CHECK-NEXT: mul v0.8h, v0.8h, v1.8h
1255 ; CHECK-NEXT: umov w0, v0.h[0]
1257 %s = sext <8 x i8> %x to <8 x i16>
1258 %z = zext <8 x i8> %y to <8 x i16>
1259 %m = mul <8 x i16> %s, %z
1260 %r = extractelement <8 x i16> %m, i32 0
1264 define <8 x i16> @smull_extended_vector_operand(<8 x i16> %v) {
; sext(<8 x i16>) to <8 x i32>, multiplied by a splat of 35584 (139 << 8),
; shifted right by one and truncated back to i16. 35584 is above the signed
; i16 maximum; all runs expect sshll/sshll2 + mul for the product and
; shrn/shrn2 #1 for the shift-and-truncate.
1265 ; CHECK-LABEL: smull_extended_vector_operand:
1266 ; CHECK: // %bb.0: // %entry
1267 ; CHECK-NEXT: movi v1.4s, #139, lsl #8
1268 ; CHECK-NEXT: sshll v2.4s, v0.4h, #0
1269 ; CHECK-NEXT: sshll2 v0.4s, v0.8h, #0
1270 ; CHECK-NEXT: mul v2.4s, v2.4s, v1.4s
1271 ; CHECK-NEXT: mul v1.4s, v0.4s, v1.4s
1272 ; CHECK-NEXT: shrn v0.4h, v2.4s, #1
1273 ; CHECK-NEXT: shrn2 v0.8h, v1.4s, #1
1276 %0 = sext <8 x i16> %v to <8 x i32>
1277 %1 = mul <8 x i32> %0, <i32 35584, i32 35584, i32 35584, i32 35584, i32 35584, i32 35584, i32 35584, i32 35584>
1278 %2 = lshr <8 x i32> %1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1279 %3 = trunc <8 x i32> %2 to <8 x i16>
1284 define void @distribute(ptr %dst, ptr %src, i32 %mul) nounwind {
; Loads 16 bytes from %src, zero-extends the high and low 8-byte halves, adds
; them, multiplies the sum by a zero-extended splat of the low byte of %mul,
; and stores the <8 x i16> result to %dst. The NEON and SVE runs expect the
; multiply to be distributed over the add as umull + umlal; the GlobalISel run
; expects ushll, uaddw2 and a single mul.
1285 ; CHECK-NEON-LABEL: distribute:
1286 ; CHECK-NEON: // %bb.0: // %entry
1287 ; CHECK-NEON-NEXT: ldr q0, [x1]
1288 ; CHECK-NEON-NEXT: dup v1.8b, w2
1289 ; CHECK-NEON-NEXT: mov d2, v0.d[1]
1290 ; CHECK-NEON-NEXT: umull v2.8h, v2.8b, v1.8b
1291 ; CHECK-NEON-NEXT: umlal v2.8h, v0.8b, v1.8b
1292 ; CHECK-NEON-NEXT: str q2, [x0]
1293 ; CHECK-NEON-NEXT: ret
1295 ; CHECK-SVE-LABEL: distribute:
1296 ; CHECK-SVE: // %bb.0: // %entry
1297 ; CHECK-SVE-NEXT: ldr q0, [x1]
1298 ; CHECK-SVE-NEXT: dup v1.8b, w2
1299 ; CHECK-SVE-NEXT: mov d2, v0.d[1]
1300 ; CHECK-SVE-NEXT: umull v2.8h, v2.8b, v1.8b
1301 ; CHECK-SVE-NEXT: umlal v2.8h, v0.8b, v1.8b
1302 ; CHECK-SVE-NEXT: str q2, [x0]
1303 ; CHECK-SVE-NEXT: ret
1305 ; CHECK-GI-LABEL: distribute:
1306 ; CHECK-GI: // %bb.0: // %entry
1307 ; CHECK-GI-NEXT: ldr q0, [x1]
1308 ; CHECK-GI-NEXT: dup v1.8b, w2
1309 ; CHECK-GI-NEXT: ushll v2.8h, v0.8b, #0
1310 ; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0
1311 ; CHECK-GI-NEXT: uaddw2 v0.8h, v2.8h, v0.16b
1312 ; CHECK-GI-NEXT: mul v0.8h, v0.8h, v1.8h
1313 ; CHECK-GI-NEXT: str q0, [x0]
1314 ; CHECK-GI-NEXT: ret
1316 %0 = trunc i32 %mul to i8
1317 %1 = insertelement <8 x i8> undef, i8 %0, i32 0
1318 %2 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
1319 %3 = load <16 x i8>, ptr %src, align 1
1320 %4 = bitcast <16 x i8> %3 to <2 x double>
1321 %5 = extractelement <2 x double> %4, i32 1
1322 %6 = bitcast double %5 to <8 x i8>
1323 %7 = zext <8 x i8> %6 to <8 x i16>
1324 %8 = zext <8 x i8> %2 to <8 x i16>
1325 %9 = extractelement <2 x double> %4, i32 0
1326 %10 = bitcast double %9 to <8 x i8>
1327 %11 = zext <8 x i8> %10 to <8 x i16>
1328 %12 = add <8 x i16> %7, %11
1329 %13 = mul <8 x i16> %12, %8
1330 store <8 x i16> %13, ptr %dst, align 2
1334 define <16 x i16> @umull2_i8(<16 x i8> %arg1, <16 x i8> %arg2) {
; Product of two zero-extended <16 x i8> vectors. Every run expects a
; umull (low half) / umull2 (high half) pair; the runs differ only in
; instruction order and in which result needs the final register move.
1335 ; CHECK-NEON-LABEL: umull2_i8:
1336 ; CHECK-NEON: // %bb.0:
1337 ; CHECK-NEON-NEXT: umull2 v2.8h, v0.16b, v1.16b
1338 ; CHECK-NEON-NEXT: umull v0.8h, v0.8b, v1.8b
1339 ; CHECK-NEON-NEXT: mov v1.16b, v2.16b
1340 ; CHECK-NEON-NEXT: ret
1342 ; CHECK-SVE-LABEL: umull2_i8:
1343 ; CHECK-SVE: // %bb.0:
1344 ; CHECK-SVE-NEXT: umull2 v2.8h, v0.16b, v1.16b
1345 ; CHECK-SVE-NEXT: umull v0.8h, v0.8b, v1.8b
1346 ; CHECK-SVE-NEXT: mov v1.16b, v2.16b
1347 ; CHECK-SVE-NEXT: ret
1349 ; CHECK-GI-LABEL: umull2_i8:
1350 ; CHECK-GI: // %bb.0:
1351 ; CHECK-GI-NEXT: umull v2.8h, v0.8b, v1.8b
1352 ; CHECK-GI-NEXT: umull2 v1.8h, v0.16b, v1.16b
1353 ; CHECK-GI-NEXT: mov v0.16b, v2.16b
1354 ; CHECK-GI-NEXT: ret
1355 %arg1_ext = zext <16 x i8> %arg1 to <16 x i16>
1356 %arg2_ext = zext <16 x i8> %arg2 to <16 x i16>
1357 %mul = mul <16 x i16> %arg1_ext, %arg2_ext
1361 define <16 x i16> @smull2_i8(<16 x i8> %arg1, <16 x i8> %arg2) {
; Product of two sign-extended <16 x i8> vectors. Every run expects a
; smull (low half) / smull2 (high half) pair; the runs differ only in
; instruction order and in which result needs the final register move.
1362 ; CHECK-NEON-LABEL: smull2_i8:
1363 ; CHECK-NEON: // %bb.0:
1364 ; CHECK-NEON-NEXT: smull2 v2.8h, v0.16b, v1.16b
1365 ; CHECK-NEON-NEXT: smull v0.8h, v0.8b, v1.8b
1366 ; CHECK-NEON-NEXT: mov v1.16b, v2.16b
1367 ; CHECK-NEON-NEXT: ret
1369 ; CHECK-SVE-LABEL: smull2_i8:
1370 ; CHECK-SVE: // %bb.0:
1371 ; CHECK-SVE-NEXT: smull2 v2.8h, v0.16b, v1.16b
1372 ; CHECK-SVE-NEXT: smull v0.8h, v0.8b, v1.8b
1373 ; CHECK-SVE-NEXT: mov v1.16b, v2.16b
1374 ; CHECK-SVE-NEXT: ret
1376 ; CHECK-GI-LABEL: smull2_i8:
1377 ; CHECK-GI: // %bb.0:
1378 ; CHECK-GI-NEXT: smull v2.8h, v0.8b, v1.8b
1379 ; CHECK-GI-NEXT: smull2 v1.8h, v0.16b, v1.16b
1380 ; CHECK-GI-NEXT: mov v0.16b, v2.16b
1381 ; CHECK-GI-NEXT: ret
1382 %arg1_ext = sext <16 x i8> %arg1 to <16 x i16>
1383 %arg2_ext = sext <16 x i8> %arg2 to <16 x i16>
1384 %mul = mul <16 x i16> %arg1_ext, %arg2_ext
1388 define <8 x i32> @umull2_i16(<8 x i16> %arg1, <8 x i16> %arg2) {
; Product of two zero-extended <8 x i16> vectors. Every run expects a
; umull (low half) / umull2 (high half) pair; the runs differ only in
; instruction order and in which result needs the final register move.
1389 ; CHECK-NEON-LABEL: umull2_i16:
1390 ; CHECK-NEON: // %bb.0:
1391 ; CHECK-NEON-NEXT: umull2 v2.4s, v0.8h, v1.8h
1392 ; CHECK-NEON-NEXT: umull v0.4s, v0.4h, v1.4h
1393 ; CHECK-NEON-NEXT: mov v1.16b, v2.16b
1394 ; CHECK-NEON-NEXT: ret
1396 ; CHECK-SVE-LABEL: umull2_i16:
1397 ; CHECK-SVE: // %bb.0:
1398 ; CHECK-SVE-NEXT: umull2 v2.4s, v0.8h, v1.8h
1399 ; CHECK-SVE-NEXT: umull v0.4s, v0.4h, v1.4h
1400 ; CHECK-SVE-NEXT: mov v1.16b, v2.16b
1401 ; CHECK-SVE-NEXT: ret
1403 ; CHECK-GI-LABEL: umull2_i16:
1404 ; CHECK-GI: // %bb.0:
1405 ; CHECK-GI-NEXT: umull v2.4s, v0.4h, v1.4h
1406 ; CHECK-GI-NEXT: umull2 v1.4s, v0.8h, v1.8h
1407 ; CHECK-GI-NEXT: mov v0.16b, v2.16b
1408 ; CHECK-GI-NEXT: ret
1409 %arg1_ext = zext <8 x i16> %arg1 to <8 x i32>
1410 %arg2_ext = zext <8 x i16> %arg2 to <8 x i32>
1411 %mul = mul <8 x i32> %arg1_ext, %arg2_ext
1415 define <8 x i32> @smull2_i16(<8 x i16> %arg1, <8 x i16> %arg2) {
; Product of two sign-extended <8 x i16> vectors. Every run expects a
; smull (low half) / smull2 (high half) pair; the runs differ only in
; instruction order and in which result needs the final register move.
1416 ; CHECK-NEON-LABEL: smull2_i16:
1417 ; CHECK-NEON: // %bb.0:
1418 ; CHECK-NEON-NEXT: smull2 v2.4s, v0.8h, v1.8h
1419 ; CHECK-NEON-NEXT: smull v0.4s, v0.4h, v1.4h
1420 ; CHECK-NEON-NEXT: mov v1.16b, v2.16b
1421 ; CHECK-NEON-NEXT: ret
1423 ; CHECK-SVE-LABEL: smull2_i16:
1424 ; CHECK-SVE: // %bb.0:
1425 ; CHECK-SVE-NEXT: smull2 v2.4s, v0.8h, v1.8h
1426 ; CHECK-SVE-NEXT: smull v0.4s, v0.4h, v1.4h
1427 ; CHECK-SVE-NEXT: mov v1.16b, v2.16b
1428 ; CHECK-SVE-NEXT: ret
1430 ; CHECK-GI-LABEL: smull2_i16:
1431 ; CHECK-GI: // %bb.0:
1432 ; CHECK-GI-NEXT: smull v2.4s, v0.4h, v1.4h
1433 ; CHECK-GI-NEXT: smull2 v1.4s, v0.8h, v1.8h
1434 ; CHECK-GI-NEXT: mov v0.16b, v2.16b
1435 ; CHECK-GI-NEXT: ret
1436 %arg1_ext = sext <8 x i16> %arg1 to <8 x i32>
1437 %arg2_ext = sext <8 x i16> %arg2 to <8 x i32>
1438 %mul = mul <8 x i32> %arg1_ext, %arg2_ext
1442 define <4 x i64> @umull2_i32(<4 x i32> %arg1, <4 x i32> %arg2) {
; Product of two zero-extended <4 x i32> vectors. Every run expects a
; umull (low half) / umull2 (high half) pair; the runs differ only in
; instruction order and in which result needs the final register move.
1443 ; CHECK-NEON-LABEL: umull2_i32:
1444 ; CHECK-NEON: // %bb.0:
1445 ; CHECK-NEON-NEXT: umull2 v2.2d, v0.4s, v1.4s
1446 ; CHECK-NEON-NEXT: umull v0.2d, v0.2s, v1.2s
1447 ; CHECK-NEON-NEXT: mov v1.16b, v2.16b
1448 ; CHECK-NEON-NEXT: ret
1450 ; CHECK-SVE-LABEL: umull2_i32:
1451 ; CHECK-SVE: // %bb.0:
1452 ; CHECK-SVE-NEXT: umull2 v2.2d, v0.4s, v1.4s
1453 ; CHECK-SVE-NEXT: umull v0.2d, v0.2s, v1.2s
1454 ; CHECK-SVE-NEXT: mov v1.16b, v2.16b
1455 ; CHECK-SVE-NEXT: ret
1457 ; CHECK-GI-LABEL: umull2_i32:
1458 ; CHECK-GI: // %bb.0:
1459 ; CHECK-GI-NEXT: umull v2.2d, v0.2s, v1.2s
1460 ; CHECK-GI-NEXT: umull2 v1.2d, v0.4s, v1.4s
1461 ; CHECK-GI-NEXT: mov v0.16b, v2.16b
1462 ; CHECK-GI-NEXT: ret
1463 %arg1_ext = zext <4 x i32> %arg1 to <4 x i64>
1464 %arg2_ext = zext <4 x i32> %arg2 to <4 x i64>
1465 %mul = mul <4 x i64> %arg1_ext, %arg2_ext
1469 define <4 x i64> @smull2_i32(<4 x i32> %arg1, <4 x i32> %arg2) {
; Product of two sign-extended <4 x i32> vectors. Every run expects a
; smull (low half) / smull2 (high half) pair; the runs differ only in
; instruction order and in which result needs the final register move.
1470 ; CHECK-NEON-LABEL: smull2_i32:
1471 ; CHECK-NEON: // %bb.0:
1472 ; CHECK-NEON-NEXT: smull2 v2.2d, v0.4s, v1.4s
1473 ; CHECK-NEON-NEXT: smull v0.2d, v0.2s, v1.2s
1474 ; CHECK-NEON-NEXT: mov v1.16b, v2.16b
1475 ; CHECK-NEON-NEXT: ret
1477 ; CHECK-SVE-LABEL: smull2_i32:
1478 ; CHECK-SVE: // %bb.0:
1479 ; CHECK-SVE-NEXT: smull2 v2.2d, v0.4s, v1.4s
1480 ; CHECK-SVE-NEXT: smull v0.2d, v0.2s, v1.2s
1481 ; CHECK-SVE-NEXT: mov v1.16b, v2.16b
1482 ; CHECK-SVE-NEXT: ret
1484 ; CHECK-GI-LABEL: smull2_i32:
1485 ; CHECK-GI: // %bb.0:
1486 ; CHECK-GI-NEXT: smull v2.2d, v0.2s, v1.2s
1487 ; CHECK-GI-NEXT: smull2 v1.2d, v0.4s, v1.4s
1488 ; CHECK-GI-NEXT: mov v0.16b, v2.16b
1489 ; CHECK-GI-NEXT: ret
1490 %arg1_ext = sext <4 x i32> %arg1 to <4 x i64>
1491 %arg2_ext = sext <4 x i32> %arg2 to <4 x i64>
1492 %mul = mul <4 x i64> %arg1_ext, %arg2_ext
1496 define <16 x i16> @amull2_i8(<16 x i8> %arg1, <16 x i8> %arg2) {
; Product of two zero-extended <16 x i8> vectors, masked to the low 8 bits of
; each lane. The NEON and SVE runs expect smull/smull2 followed by bic; the
; GlobalISel run expects umull/umull2 followed by and with the mask.
1497 ; CHECK-NEON-LABEL: amull2_i8:
1498 ; CHECK-NEON: // %bb.0:
1499 ; CHECK-NEON-NEXT: smull v2.8h, v0.8b, v1.8b
1500 ; CHECK-NEON-NEXT: smull2 v1.8h, v0.16b, v1.16b
1501 ; CHECK-NEON-NEXT: bic v2.8h, #255, lsl #8
1502 ; CHECK-NEON-NEXT: bic v1.8h, #255, lsl #8
1503 ; CHECK-NEON-NEXT: mov v0.16b, v2.16b
1504 ; CHECK-NEON-NEXT: ret
1506 ; CHECK-SVE-LABEL: amull2_i8:
1507 ; CHECK-SVE: // %bb.0:
1508 ; CHECK-SVE-NEXT: smull v2.8h, v0.8b, v1.8b
1509 ; CHECK-SVE-NEXT: smull2 v1.8h, v0.16b, v1.16b
1510 ; CHECK-SVE-NEXT: bic v2.8h, #255, lsl #8
1511 ; CHECK-SVE-NEXT: bic v1.8h, #255, lsl #8
1512 ; CHECK-SVE-NEXT: mov v0.16b, v2.16b
1513 ; CHECK-SVE-NEXT: ret
1515 ; CHECK-GI-LABEL: amull2_i8:
1516 ; CHECK-GI: // %bb.0:
1517 ; CHECK-GI-NEXT: movi v2.2d, #0xff00ff00ff00ff
1518 ; CHECK-GI-NEXT: umull v3.8h, v0.8b, v1.8b
1519 ; CHECK-GI-NEXT: umull2 v1.8h, v0.16b, v1.16b
1520 ; CHECK-GI-NEXT: and v0.16b, v3.16b, v2.16b
1521 ; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b
1522 ; CHECK-GI-NEXT: ret
1523 %arg1_ext = zext <16 x i8> %arg1 to <16 x i16>
1524 %arg2_ext = zext <16 x i8> %arg2 to <16 x i16>
1525 %mul = mul <16 x i16> %arg1_ext, %arg2_ext
1526 %and = and <16 x i16> %mul, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
1530 define <8 x i32> @amull2_i16(<8 x i16> %arg1, <8 x i16> %arg2) {
; Product of two zero-extended <8 x i16> vectors, masked to the low 16 bits
; of each lane. The NEON and SVE runs expect smull/smull2 followed by and
; with the mask; the GlobalISel run expects umull/umull2 followed by and.
1531 ; CHECK-NEON-LABEL: amull2_i16:
1532 ; CHECK-NEON: // %bb.0:
1533 ; CHECK-NEON-NEXT: movi v2.2d, #0x00ffff0000ffff
1534 ; CHECK-NEON-NEXT: smull v3.4s, v0.4h, v1.4h
1535 ; CHECK-NEON-NEXT: smull2 v0.4s, v0.8h, v1.8h
1536 ; CHECK-NEON-NEXT: and v1.16b, v0.16b, v2.16b
1537 ; CHECK-NEON-NEXT: and v0.16b, v3.16b, v2.16b
1538 ; CHECK-NEON-NEXT: ret
1540 ; CHECK-SVE-LABEL: amull2_i16:
1541 ; CHECK-SVE: // %bb.0:
1542 ; CHECK-SVE-NEXT: movi v2.2d, #0x00ffff0000ffff
1543 ; CHECK-SVE-NEXT: smull v3.4s, v0.4h, v1.4h
1544 ; CHECK-SVE-NEXT: smull2 v0.4s, v0.8h, v1.8h
1545 ; CHECK-SVE-NEXT: and v1.16b, v0.16b, v2.16b
1546 ; CHECK-SVE-NEXT: and v0.16b, v3.16b, v2.16b
1547 ; CHECK-SVE-NEXT: ret
1549 ; CHECK-GI-LABEL: amull2_i16:
1550 ; CHECK-GI: // %bb.0:
1551 ; CHECK-GI-NEXT: movi v2.2d, #0x00ffff0000ffff
1552 ; CHECK-GI-NEXT: umull v3.4s, v0.4h, v1.4h
1553 ; CHECK-GI-NEXT: umull2 v1.4s, v0.8h, v1.8h
1554 ; CHECK-GI-NEXT: and v0.16b, v3.16b, v2.16b
1555 ; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b
1556 ; CHECK-GI-NEXT: ret
1557 %arg1_ext = zext <8 x i16> %arg1 to <8 x i32>
1558 %arg2_ext = zext <8 x i16> %arg2 to <8 x i32>
1559 %mul = mul <8 x i32> %arg1_ext, %arg2_ext
1560 %and = and <8 x i32> %mul, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
1564 define <4 x i64> @amull2_i32(<4 x i32> %arg1, <4 x i32> %arg2) {
; Product of two zero-extended <4 x i32> vectors, masked to the low 32 bits
; of each lane. The NEON and SVE runs expect smull/smull2 followed by and
; with the mask; the GlobalISel run expects umull/umull2 followed by and.
1565 ; CHECK-NEON-LABEL: amull2_i32:
1566 ; CHECK-NEON: // %bb.0:
1567 ; CHECK-NEON-NEXT: movi v2.2d, #0x000000ffffffff
1568 ; CHECK-NEON-NEXT: smull v3.2d, v0.2s, v1.2s
1569 ; CHECK-NEON-NEXT: smull2 v0.2d, v0.4s, v1.4s
1570 ; CHECK-NEON-NEXT: and v1.16b, v0.16b, v2.16b
1571 ; CHECK-NEON-NEXT: and v0.16b, v3.16b, v2.16b
1572 ; CHECK-NEON-NEXT: ret
1574 ; CHECK-SVE-LABEL: amull2_i32:
1575 ; CHECK-SVE: // %bb.0:
1576 ; CHECK-SVE-NEXT: movi v2.2d, #0x000000ffffffff
1577 ; CHECK-SVE-NEXT: smull v3.2d, v0.2s, v1.2s
1578 ; CHECK-SVE-NEXT: smull2 v0.2d, v0.4s, v1.4s
1579 ; CHECK-SVE-NEXT: and v1.16b, v0.16b, v2.16b
1580 ; CHECK-SVE-NEXT: and v0.16b, v3.16b, v2.16b
1581 ; CHECK-SVE-NEXT: ret
1583 ; CHECK-GI-LABEL: amull2_i32:
1584 ; CHECK-GI: // %bb.0:
1585 ; CHECK-GI-NEXT: movi v2.2d, #0x000000ffffffff
1586 ; CHECK-GI-NEXT: umull v3.2d, v0.2s, v1.2s
1587 ; CHECK-GI-NEXT: umull2 v1.2d, v0.4s, v1.4s
1588 ; CHECK-GI-NEXT: and v0.16b, v3.16b, v2.16b
1589 ; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b
1590 ; CHECK-GI-NEXT: ret
1591 %arg1_ext = zext <4 x i32> %arg1 to <4 x i64>
1592 %arg2_ext = zext <4 x i32> %arg2 to <4 x i64>
1593 %mul = mul <4 x i64> %arg1_ext, %arg2_ext
1594 %and = and <4 x i64> %mul, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
1599 define <8 x i16> @umull_and_v8i16(<8 x i8> %src1, <8 x i16> %src2) {
; zext(<8 x i8>) multiplied by %src2 masked with 255 per lane. The NEON and
; SVE runs expect the masked operand to be narrowed (bic + xtn) and fed to
; umull; the GlobalISel run expects and, ushll and a full-width mul.
1600 ; CHECK-NEON-LABEL: umull_and_v8i16:
1601 ; CHECK-NEON: // %bb.0: // %entry
1602 ; CHECK-NEON-NEXT: bic v1.8h, #255, lsl #8
1603 ; CHECK-NEON-NEXT: xtn v1.8b, v1.8h
1604 ; CHECK-NEON-NEXT: umull v0.8h, v0.8b, v1.8b
1605 ; CHECK-NEON-NEXT: ret
1607 ; CHECK-SVE-LABEL: umull_and_v8i16:
1608 ; CHECK-SVE: // %bb.0: // %entry
1609 ; CHECK-SVE-NEXT: bic v1.8h, #255, lsl #8
1610 ; CHECK-SVE-NEXT: xtn v1.8b, v1.8h
1611 ; CHECK-SVE-NEXT: umull v0.8h, v0.8b, v1.8b
1612 ; CHECK-SVE-NEXT: ret
1614 ; CHECK-GI-LABEL: umull_and_v8i16:
1615 ; CHECK-GI: // %bb.0: // %entry
1616 ; CHECK-GI-NEXT: movi v2.2d, #0xff00ff00ff00ff
1617 ; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0
1618 ; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b
1619 ; CHECK-GI-NEXT: mul v0.8h, v0.8h, v1.8h
1620 ; CHECK-GI-NEXT: ret
1622 %in1 = zext <8 x i8> %src1 to <8 x i16>
1623 %in2 = and <8 x i16> %src2, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
1624 %out = mul nsw <8 x i16> %in1, %in2
1628 define <8 x i16> @umull_and_v8i16_c(<8 x i8> %src1, <8 x i16> %src2) {
; Same computation as umull_and_v8i16 but with the mul operands commuted
; (masked value first). The NEON and SVE runs still expect bic + xtn + umull;
; the GlobalISel run expects and, ushll and a full-width mul.
1629 ; CHECK-NEON-LABEL: umull_and_v8i16_c:
1630 ; CHECK-NEON: // %bb.0: // %entry
1631 ; CHECK-NEON-NEXT: bic v1.8h, #255, lsl #8
1632 ; CHECK-NEON-NEXT: xtn v1.8b, v1.8h
1633 ; CHECK-NEON-NEXT: umull v0.8h, v1.8b, v0.8b
1634 ; CHECK-NEON-NEXT: ret
1636 ; CHECK-SVE-LABEL: umull_and_v8i16_c:
1637 ; CHECK-SVE: // %bb.0: // %entry
1638 ; CHECK-SVE-NEXT: bic v1.8h, #255, lsl #8
1639 ; CHECK-SVE-NEXT: xtn v1.8b, v1.8h
1640 ; CHECK-SVE-NEXT: umull v0.8h, v1.8b, v0.8b
1641 ; CHECK-SVE-NEXT: ret
1643 ; CHECK-GI-LABEL: umull_and_v8i16_c:
1644 ; CHECK-GI: // %bb.0: // %entry
1645 ; CHECK-GI-NEXT: movi v2.2d, #0xff00ff00ff00ff
1646 ; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0
1647 ; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b
1648 ; CHECK-GI-NEXT: mul v0.8h, v1.8h, v0.8h
1649 ; CHECK-GI-NEXT: ret
1651 %in1 = zext <8 x i8> %src1 to <8 x i16>
1652 %in2 = and <8 x i16> %src2, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
1653 %out = mul nsw <8 x i16> %in2, %in1
1657 define <8 x i16> @umull_and256_v8i16(<8 x i8> %src1, <8 x i16> %src2) {
; Negative case: the mask is 256 (bit 8), so the masked operand does not fit
; in 8 bits. All runs expect and, ushll and a full-width mul, with no umull.
1658 ; CHECK-LABEL: umull_and256_v8i16:
1659 ; CHECK: // %bb.0: // %entry
1660 ; CHECK-NEXT: movi v2.8h, #1, lsl #8
1661 ; CHECK-NEXT: ushll v0.8h, v0.8b, #0
1662 ; CHECK-NEXT: and v1.16b, v1.16b, v2.16b
1663 ; CHECK-NEXT: mul v0.8h, v0.8h, v1.8h
1666 %in1 = zext <8 x i8> %src1 to <8 x i16>
1667 %in2 = and <8 x i16> %src2, <i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256>
1668 %out = mul nsw <8 x i16> %in1, %in2
1672 define <8 x i16> @umull_andconst_v8i16(<8 x i8> %src1, <8 x i16> %src2) {
; zext(<8 x i8>) multiplied by a splat of 255; %src2 is not used by the body.
; The NEON and SVE runs expect umull against an all-ones byte vector; the
; GlobalISel run expects ushll and a full-width mul.
1673 ; CHECK-NEON-LABEL: umull_andconst_v8i16:
1674 ; CHECK-NEON: // %bb.0: // %entry
1675 ; CHECK-NEON-NEXT: movi v1.2d, #0xffffffffffffffff
1676 ; CHECK-NEON-NEXT: umull v0.8h, v0.8b, v1.8b
1677 ; CHECK-NEON-NEXT: ret
1679 ; CHECK-SVE-LABEL: umull_andconst_v8i16:
1680 ; CHECK-SVE: // %bb.0: // %entry
1681 ; CHECK-SVE-NEXT: movi v1.2d, #0xffffffffffffffff
1682 ; CHECK-SVE-NEXT: umull v0.8h, v0.8b, v1.8b
1683 ; CHECK-SVE-NEXT: ret
1685 ; CHECK-GI-LABEL: umull_andconst_v8i16:
1686 ; CHECK-GI: // %bb.0: // %entry
1687 ; CHECK-GI-NEXT: movi v1.2d, #0xff00ff00ff00ff
1688 ; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0
1689 ; CHECK-GI-NEXT: mul v0.8h, v0.8h, v1.8h
1690 ; CHECK-GI-NEXT: ret
1692 %in1 = zext <8 x i8> %src1 to <8 x i16>
1693 %out = mul nsw <8 x i16> %in1, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
1697 define <8 x i16> @umull_smaller_v8i16(<8 x i4> %src1, <8 x i16> %src2) {
; The zero-extended operand is <8 x i4>, narrower than a byte. The NEON and
; SVE runs expect it to be masked with #15 in 8-bit lanes, the other operand
; narrowed with bic + xtn, and umull; the GlobalISel run expects ushll, two
; and operations and a full-width mul.
1698 ; CHECK-NEON-LABEL: umull_smaller_v8i16:
1699 ; CHECK-NEON: // %bb.0: // %entry
1700 ; CHECK-NEON-NEXT: movi v2.8b, #15
1701 ; CHECK-NEON-NEXT: bic v1.8h, #255, lsl #8
1702 ; CHECK-NEON-NEXT: xtn v1.8b, v1.8h
1703 ; CHECK-NEON-NEXT: and v0.8b, v0.8b, v2.8b
1704 ; CHECK-NEON-NEXT: umull v0.8h, v0.8b, v1.8b
1705 ; CHECK-NEON-NEXT: ret
1707 ; CHECK-SVE-LABEL: umull_smaller_v8i16:
1708 ; CHECK-SVE: // %bb.0: // %entry
1709 ; CHECK-SVE-NEXT: movi v2.8b, #15
1710 ; CHECK-SVE-NEXT: bic v1.8h, #255, lsl #8
1711 ; CHECK-SVE-NEXT: xtn v1.8b, v1.8h
1712 ; CHECK-SVE-NEXT: and v0.8b, v0.8b, v2.8b
1713 ; CHECK-SVE-NEXT: umull v0.8h, v0.8b, v1.8b
1714 ; CHECK-SVE-NEXT: ret
1716 ; CHECK-GI-LABEL: umull_smaller_v8i16:
1717 ; CHECK-GI: // %bb.0: // %entry
1718 ; CHECK-GI-NEXT: movi v2.2d, #0xff00ff00ff00ff
1719 ; CHECK-GI-NEXT: movi v3.8h, #15
1720 ; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0
1721 ; CHECK-GI-NEXT: and v0.16b, v0.16b, v3.16b
1722 ; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b
1723 ; CHECK-GI-NEXT: mul v0.8h, v0.8h, v1.8h
1724 ; CHECK-GI-NEXT: ret
1726 %in1 = zext <8 x i4> %src1 to <8 x i16>
1727 %in2 = and <8 x i16> %src2, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
1728 %out = mul nsw <8 x i16> %in1, %in2
1732 define <4 x i32> @umull_and_v4i32(<4 x i16> %src1, <4 x i32> %src2) {
; zext(<4 x i16>) multiplied by %src2 masked with 255 per lane. The NEON and
; SVE runs expect the masked operand to be narrowed (and + xtn) and fed to
; umull; the GlobalISel run expects ushll, and and a 32-bit-lane mul.
1733 ; CHECK-NEON-LABEL: umull_and_v4i32:
1734 ; CHECK-NEON: // %bb.0: // %entry
1735 ; CHECK-NEON-NEXT: movi v2.2d, #0x0000ff000000ff
1736 ; CHECK-NEON-NEXT: and v1.16b, v1.16b, v2.16b
1737 ; CHECK-NEON-NEXT: xtn v1.4h, v1.4s
1738 ; CHECK-NEON-NEXT: umull v0.4s, v0.4h, v1.4h
1739 ; CHECK-NEON-NEXT: ret
1741 ; CHECK-SVE-LABEL: umull_and_v4i32:
1742 ; CHECK-SVE: // %bb.0: // %entry
1743 ; CHECK-SVE-NEXT: movi v2.2d, #0x0000ff000000ff
1744 ; CHECK-SVE-NEXT: and v1.16b, v1.16b, v2.16b
1745 ; CHECK-SVE-NEXT: xtn v1.4h, v1.4s
1746 ; CHECK-SVE-NEXT: umull v0.4s, v0.4h, v1.4h
1747 ; CHECK-SVE-NEXT: ret
1749 ; CHECK-GI-LABEL: umull_and_v4i32:
1750 ; CHECK-GI: // %bb.0: // %entry
1751 ; CHECK-GI-NEXT: movi v2.2d, #0x0000ff000000ff
1752 ; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0
1753 ; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b
1754 ; CHECK-GI-NEXT: mul v0.4s, v0.4s, v1.4s
1755 ; CHECK-GI-NEXT: ret
1757 %in1 = zext <4 x i16> %src1 to <4 x i32>
1758 %in2 = and <4 x i32> %src2, <i32 255, i32 255, i32 255, i32 255>
1759 %out = mul nsw <4 x i32> %in1, %in2
1763 define <8 x i32> @umull_and_v8i32(<8 x i16> %src1, <8 x i32> %src2) {
; 256-bit variant: zext(<8 x i16>) multiplied by %src2 masked with 255 per
; lane. The NEON and SVE runs expect both halves of %src2 to be masked,
; packed into one 16-bit-lane vector with uzp1, and multiplied with
; umull2/umull; the GlobalISel run expects ushll/ushll2, and and two muls.
1764 ; CHECK-NEON-LABEL: umull_and_v8i32:
1765 ; CHECK-NEON: // %bb.0: // %entry
1766 ; CHECK-NEON-NEXT: movi v3.2d, #0x0000ff000000ff
1767 ; CHECK-NEON-NEXT: and v2.16b, v2.16b, v3.16b
1768 ; CHECK-NEON-NEXT: and v1.16b, v1.16b, v3.16b
1769 ; CHECK-NEON-NEXT: uzp1 v2.8h, v1.8h, v2.8h
1770 ; CHECK-NEON-NEXT: umull2 v1.4s, v0.8h, v2.8h
1771 ; CHECK-NEON-NEXT: umull v0.4s, v0.4h, v2.4h
1772 ; CHECK-NEON-NEXT: ret
1774 ; CHECK-SVE-LABEL: umull_and_v8i32:
1775 ; CHECK-SVE: // %bb.0: // %entry
1776 ; CHECK-SVE-NEXT: movi v3.2d, #0x0000ff000000ff
1777 ; CHECK-SVE-NEXT: and v2.16b, v2.16b, v3.16b
1778 ; CHECK-SVE-NEXT: and v1.16b, v1.16b, v3.16b
1779 ; CHECK-SVE-NEXT: uzp1 v2.8h, v1.8h, v2.8h
1780 ; CHECK-SVE-NEXT: umull2 v1.4s, v0.8h, v2.8h
1781 ; CHECK-SVE-NEXT: umull v0.4s, v0.4h, v2.4h
1782 ; CHECK-SVE-NEXT: ret
1784 ; CHECK-GI-LABEL: umull_and_v8i32:
1785 ; CHECK-GI: // %bb.0: // %entry
1786 ; CHECK-GI-NEXT: movi v3.2d, #0x0000ff000000ff
1787 ; CHECK-GI-NEXT: ushll v4.4s, v0.4h, #0
1788 ; CHECK-GI-NEXT: ushll2 v5.4s, v0.8h, #0
1789 ; CHECK-GI-NEXT: and v0.16b, v1.16b, v3.16b
1790 ; CHECK-GI-NEXT: and v1.16b, v2.16b, v3.16b
1791 ; CHECK-GI-NEXT: mul v0.4s, v4.4s, v0.4s
1792 ; CHECK-GI-NEXT: mul v1.4s, v5.4s, v1.4s
1793 ; CHECK-GI-NEXT: ret
1795 %in1 = zext <8 x i16> %src1 to <8 x i32>
1796 %in2 = and <8 x i32> %src2, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
1797 %out = mul nsw <8 x i32> %in1, %in2
1801 define <8 x i32> @umull_and_v8i32_dup(<8 x i16> %src1, i32 %src2) {
; zext(<8 x i16>) multiplied by a splat of the scalar (%src2 & 255). The NEON
; and SVE runs expect the masked scalar to be duplicated into 16-bit lanes and
; multiplied with umull2/umull; the GlobalISel run expects ushll/ushll2, a
; 32-bit-lane dup and two muls.
1802 ; CHECK-NEON-LABEL: umull_and_v8i32_dup:
1803 ; CHECK-NEON: // %bb.0: // %entry
1804 ; CHECK-NEON-NEXT: and w8, w0, #0xff
1805 ; CHECK-NEON-NEXT: dup v2.8h, w8
1806 ; CHECK-NEON-NEXT: umull2 v1.4s, v0.8h, v2.8h
1807 ; CHECK-NEON-NEXT: umull v0.4s, v0.4h, v2.4h
1808 ; CHECK-NEON-NEXT: ret
1810 ; CHECK-SVE-LABEL: umull_and_v8i32_dup:
1811 ; CHECK-SVE: // %bb.0: // %entry
1812 ; CHECK-SVE-NEXT: and w8, w0, #0xff
1813 ; CHECK-SVE-NEXT: dup v2.8h, w8
1814 ; CHECK-SVE-NEXT: umull2 v1.4s, v0.8h, v2.8h
1815 ; CHECK-SVE-NEXT: umull v0.4s, v0.4h, v2.4h
1816 ; CHECK-SVE-NEXT: ret
1818 ; CHECK-GI-LABEL: umull_and_v8i32_dup:
1819 ; CHECK-GI: // %bb.0: // %entry
1820 ; CHECK-GI-NEXT: and w8, w0, #0xff
1821 ; CHECK-GI-NEXT: ushll v1.4s, v0.4h, #0
1822 ; CHECK-GI-NEXT: ushll2 v2.4s, v0.8h, #0
1823 ; CHECK-GI-NEXT: dup v3.4s, w8
1824 ; CHECK-GI-NEXT: mul v0.4s, v1.4s, v3.4s
1825 ; CHECK-GI-NEXT: mul v1.4s, v2.4s, v3.4s
1826 ; CHECK-GI-NEXT: ret
1828 %in1 = zext <8 x i16> %src1 to <8 x i32>
1829 %in2 = and i32 %src2, 255
1830 %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %in2, i64 0
1831 %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
1832 %out = mul nsw <8 x i32> %in1, %broadcast.splat
; Multiplies a zero-extended <2 x i32> vector by %src2 with each 64-bit lane
; masked to its low 8 bits, producing <2 x i64>.
; Expected NEON and SVE output: and with a movi mask, xtn down to 32-bit lanes,
; then a single umull.
; Expected GlobalISel output: widen with ushll, then multiply each lane in
; general-purpose registers and rebuild the vector with fmov/mov.
1836 define <2 x i64> @umull_and_v2i64(<2 x i32> %src1, <2 x i64> %src2) {
1837 ; CHECK-NEON-LABEL: umull_and_v2i64:
1838 ; CHECK-NEON: // %bb.0: // %entry
1839 ; CHECK-NEON-NEXT: movi v2.2d, #0x000000000000ff
1840 ; CHECK-NEON-NEXT: and v1.16b, v1.16b, v2.16b
1841 ; CHECK-NEON-NEXT: xtn v1.2s, v1.2d
1842 ; CHECK-NEON-NEXT: umull v0.2d, v0.2s, v1.2s
1843 ; CHECK-NEON-NEXT: ret
1845 ; CHECK-SVE-LABEL: umull_and_v2i64:
1846 ; CHECK-SVE: // %bb.0: // %entry
1847 ; CHECK-SVE-NEXT: movi v2.2d, #0x000000000000ff
1848 ; CHECK-SVE-NEXT: and v1.16b, v1.16b, v2.16b
1849 ; CHECK-SVE-NEXT: xtn v1.2s, v1.2d
1850 ; CHECK-SVE-NEXT: umull v0.2d, v0.2s, v1.2s
1851 ; CHECK-SVE-NEXT: ret
1853 ; CHECK-GI-LABEL: umull_and_v2i64:
1854 ; CHECK-GI: // %bb.0: // %entry
1855 ; CHECK-GI-NEXT: movi v2.2d, #0x000000000000ff
1856 ; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0
1857 ; CHECK-GI-NEXT: fmov x8, d0
1858 ; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b
1859 ; CHECK-GI-NEXT: mov d2, v0.d[1]
1860 ; CHECK-GI-NEXT: mov d3, v1.d[1]
1861 ; CHECK-GI-NEXT: fmov x9, d1
1862 ; CHECK-GI-NEXT: mul x8, x8, x9
1863 ; CHECK-GI-NEXT: fmov x9, d2
1864 ; CHECK-GI-NEXT: fmov x10, d3
1865 ; CHECK-GI-NEXT: mul x9, x9, x10
1866 ; CHECK-GI-NEXT: fmov d0, x8
1867 ; CHECK-GI-NEXT: mov v0.d[1], x9
1868 ; CHECK-GI-NEXT: ret
; The 255 mask leaves at most 8 significant bits in each lane of %in2.
1870 %in1 = zext <2 x i32> %src1 to <2 x i64>
1871 %in2 = and <2 x i64> %src2, <i64 255, i64 255>
1872 %out = mul nsw <2 x i64> %in1, %in2
; Four-lane variant: multiplies a zero-extended <4 x i32> vector by %src2 with
; each 64-bit lane masked to its low 8 bits, producing <4 x i64>.
; Expected NEON and SVE output: mask both 128-bit halves of %src2, pack them
; into one 4s vector with uzp1, then umull2/umull.
; Expected GlobalISel output: widen with ushll/ushll2, then perform four scalar
; mul instructions and reassemble two result vectors.
1876 define <4 x i64> @umull_and_v4i64(<4 x i32> %src1, <4 x i64> %src2) {
1877 ; CHECK-NEON-LABEL: umull_and_v4i64:
1878 ; CHECK-NEON: // %bb.0: // %entry
1879 ; CHECK-NEON-NEXT: movi v3.2d, #0x000000000000ff
1880 ; CHECK-NEON-NEXT: and v2.16b, v2.16b, v3.16b
1881 ; CHECK-NEON-NEXT: and v1.16b, v1.16b, v3.16b
1882 ; CHECK-NEON-NEXT: uzp1 v2.4s, v1.4s, v2.4s
1883 ; CHECK-NEON-NEXT: umull2 v1.2d, v0.4s, v2.4s
1884 ; CHECK-NEON-NEXT: umull v0.2d, v0.2s, v2.2s
1885 ; CHECK-NEON-NEXT: ret
1887 ; CHECK-SVE-LABEL: umull_and_v4i64:
1888 ; CHECK-SVE: // %bb.0: // %entry
1889 ; CHECK-SVE-NEXT: movi v3.2d, #0x000000000000ff
1890 ; CHECK-SVE-NEXT: and v2.16b, v2.16b, v3.16b
1891 ; CHECK-SVE-NEXT: and v1.16b, v1.16b, v3.16b
1892 ; CHECK-SVE-NEXT: uzp1 v2.4s, v1.4s, v2.4s
1893 ; CHECK-SVE-NEXT: umull2 v1.2d, v0.4s, v2.4s
1894 ; CHECK-SVE-NEXT: umull v0.2d, v0.2s, v2.2s
1895 ; CHECK-SVE-NEXT: ret
1897 ; CHECK-GI-LABEL: umull_and_v4i64:
1898 ; CHECK-GI: // %bb.0: // %entry
1899 ; CHECK-GI-NEXT: movi v3.2d, #0x000000000000ff
1900 ; CHECK-GI-NEXT: ushll v4.2d, v0.2s, #0
1901 ; CHECK-GI-NEXT: ushll2 v0.2d, v0.4s, #0
1902 ; CHECK-GI-NEXT: fmov x8, d4
1903 ; CHECK-GI-NEXT: and v1.16b, v1.16b, v3.16b
1904 ; CHECK-GI-NEXT: and v2.16b, v2.16b, v3.16b
1905 ; CHECK-GI-NEXT: mov d3, v4.d[1]
1906 ; CHECK-GI-NEXT: fmov x9, d1
1907 ; CHECK-GI-NEXT: mov d4, v1.d[1]
1908 ; CHECK-GI-NEXT: fmov x10, d2
1909 ; CHECK-GI-NEXT: mov d1, v0.d[1]
1910 ; CHECK-GI-NEXT: mul x8, x8, x9
1911 ; CHECK-GI-NEXT: fmov x9, d0
1912 ; CHECK-GI-NEXT: mov d0, v2.d[1]
1913 ; CHECK-GI-NEXT: fmov x11, d4
1914 ; CHECK-GI-NEXT: mul x9, x9, x10
1915 ; CHECK-GI-NEXT: fmov x10, d3
1916 ; CHECK-GI-NEXT: fmov x12, d0
1917 ; CHECK-GI-NEXT: fmov d0, x8
1918 ; CHECK-GI-NEXT: mul x10, x10, x11
1919 ; CHECK-GI-NEXT: fmov x11, d1
1920 ; CHECK-GI-NEXT: fmov d1, x9
1921 ; CHECK-GI-NEXT: mul x11, x11, x12
1922 ; CHECK-GI-NEXT: mov v0.d[1], x10
1923 ; CHECK-GI-NEXT: mov v1.d[1], x11
1924 ; CHECK-GI-NEXT: ret
; The 255 mask leaves at most 8 significant bits in each lane of %in2.
1926 %in1 = zext <4 x i32> %src1 to <4 x i64>
1927 %in2 = and <4 x i64> %src2, <i64 255, i64 255, i64 255, i64 255>
1928 %out = mul nsw <4 x i64> %in1, %in2
; Multiplies a zero-extended <4 x i32> vector by a splat of (%src2 & 255),
; producing <4 x i64>.
; Expected NEON and SVE output: mask the scalar in a 32-bit register, dup it
; into 4s lanes, then umull2/umull.
; Expected GlobalISel output: mask in a 64-bit register, dup into 2d lanes,
; widen %src1 with ushll/ushll2, then four scalar mul instructions.
1932 define <4 x i64> @umull_and_v4i64_dup(<4 x i32> %src1, i64 %src2) {
1933 ; CHECK-NEON-LABEL: umull_and_v4i64_dup:
1934 ; CHECK-NEON: // %bb.0: // %entry
1935 ; CHECK-NEON-NEXT: and w8, w0, #0xff
1936 ; CHECK-NEON-NEXT: dup v2.4s, w8
1937 ; CHECK-NEON-NEXT: umull2 v1.2d, v0.4s, v2.4s
1938 ; CHECK-NEON-NEXT: umull v0.2d, v0.2s, v2.2s
1939 ; CHECK-NEON-NEXT: ret
1941 ; CHECK-SVE-LABEL: umull_and_v4i64_dup:
1942 ; CHECK-SVE: // %bb.0: // %entry
1943 ; CHECK-SVE-NEXT: and w8, w0, #0xff
1944 ; CHECK-SVE-NEXT: dup v2.4s, w8
1945 ; CHECK-SVE-NEXT: umull2 v1.2d, v0.4s, v2.4s
1946 ; CHECK-SVE-NEXT: umull v0.2d, v0.2s, v2.2s
1947 ; CHECK-SVE-NEXT: ret
1949 ; CHECK-GI-LABEL: umull_and_v4i64_dup:
1950 ; CHECK-GI: // %bb.0: // %entry
1951 ; CHECK-GI-NEXT: and x8, x0, #0xff
1952 ; CHECK-GI-NEXT: ushll v1.2d, v0.2s, #0
1953 ; CHECK-GI-NEXT: ushll2 v0.2d, v0.4s, #0
1954 ; CHECK-GI-NEXT: dup v2.2d, x8
1955 ; CHECK-GI-NEXT: mov d3, v1.d[1]
1956 ; CHECK-GI-NEXT: fmov x8, d1
1957 ; CHECK-GI-NEXT: fmov x10, d0
1958 ; CHECK-GI-NEXT: mov d1, v2.d[1]
1959 ; CHECK-GI-NEXT: fmov x9, d2
1960 ; CHECK-GI-NEXT: mov d2, v0.d[1]
1961 ; CHECK-GI-NEXT: mul x8, x8, x9
1962 ; CHECK-GI-NEXT: fmov x11, d1
1963 ; CHECK-GI-NEXT: fmov x12, d2
1964 ; CHECK-GI-NEXT: mul x9, x10, x9
1965 ; CHECK-GI-NEXT: fmov x10, d3
1966 ; CHECK-GI-NEXT: mul x10, x10, x11
1967 ; CHECK-GI-NEXT: fmov d0, x8
1968 ; CHECK-GI-NEXT: mul x11, x12, x11
1969 ; CHECK-GI-NEXT: fmov d1, x9
1970 ; CHECK-GI-NEXT: mov v0.d[1], x10
1971 ; CHECK-GI-NEXT: mov v1.d[1], x11
1972 ; CHECK-GI-NEXT: ret
; Splat idiom: insert the masked scalar into lane 0, then shuffle with an
; all-zero mask so every lane reads lane 0.
1974 %in1 = zext <4 x i32> %src1 to <4 x i64>
1975 %in2 = and i64 %src2, 255
1976 %broadcast.splatinsert = insertelement <4 x i64> undef, i64 %in2, i64 0
1977 %broadcast.splat = shufflevector <4 x i64> %broadcast.splatinsert, <4 x i64> undef, <4 x i32> zeroinitializer
1978 %out = mul nsw <4 x i64> %in1, %broadcast.splat
; Computes %1 - pmull(high 8 bytes of %0, trunc(<8 x i16> loaded from %3 + 16
; bytes)) and stores the <8 x i16> result to %2.
; Expected output for all run lines: ldr from [x1, #16], uzp1, pmull2, a
; separate sub, then str to [x0].
1982 define void @pmlsl2_v8i16_uzp1(<16 x i8> %0, <8 x i16> %1, ptr %2, ptr %3) {
1983 ; CHECK-LABEL: pmlsl2_v8i16_uzp1:
1985 ; CHECK-NEXT: ldr q2, [x1, #16]
1986 ; CHECK-NEXT: uzp1 v2.16b, v0.16b, v2.16b
1987 ; CHECK-NEXT: pmull2 v0.8h, v0.16b, v2.16b
1988 ; CHECK-NEXT: sub v0.8h, v1.8h, v0.8h
1989 ; CHECK-NEXT: str q0, [x0]
; The getelementptr indexes by i32, so index 4 is a 16-byte offset.
; The shufflevector mask <8..15> selects the upper half of %0.
1991 %5 = getelementptr inbounds i32, ptr %3, i64 4
1992 %6 = load <8 x i16>, ptr %5, align 4
1993 %7 = trunc <8 x i16> %6 to <8 x i8>
1994 %8 = shufflevector <16 x i8> %0, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1995 %9 = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %8, <8 x i8> %7)
1996 %10 = sub <8 x i16> %1, %9
1997 store <8 x i16> %10, ptr %2, align 16
; Computes %1 - smull(high 8 bytes of %0, trunc(<8 x i16> loaded from %3 + 16
; bytes)) and stores the <8 x i16> result to %2.
; Expected output for all run lines: ldr from [x1, #16], uzp1, a single smlsl2
; accumulating into v1, then str to [x0].
2001 define void @smlsl2_v8i16_uzp1(<16 x i8> %0, <8 x i16> %1, ptr %2, ptr %3) {
2002 ; CHECK-LABEL: smlsl2_v8i16_uzp1:
2004 ; CHECK-NEXT: ldr q2, [x1, #16]
2005 ; CHECK-NEXT: uzp1 v2.16b, v0.16b, v2.16b
2006 ; CHECK-NEXT: smlsl2 v1.8h, v0.16b, v2.16b
2007 ; CHECK-NEXT: str q1, [x0]
; The getelementptr indexes by i32, so index 4 is a 16-byte offset.
; The shufflevector mask <8..15> selects the upper half of %0.
2009 %5 = getelementptr inbounds i32, ptr %3, i64 4
2010 %6 = load <8 x i16>, ptr %5, align 4
2011 %7 = trunc <8 x i16> %6 to <8 x i8>
2012 %8 = shufflevector <16 x i8> %0, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
2013 %9 = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %8, <8 x i8> %7)
2014 %10 = sub <8 x i16> %1, %9
2015 store <8 x i16> %10, ptr %2, align 16
; Computes %1 - umull(high 8 bytes of %0, trunc(<8 x i16> loaded from %3 + 16
; bytes)) and stores the <8 x i16> result to %2.
; Expected output for all run lines: ldr from [x1, #16], uzp1, a single umlsl2
; accumulating into v1, then str to [x0].
2019 define void @umlsl2_v8i16_uzp1(<16 x i8> %0, <8 x i16> %1, ptr %2, ptr %3) {
2020 ; CHECK-LABEL: umlsl2_v8i16_uzp1:
2022 ; CHECK-NEXT: ldr q2, [x1, #16]
2023 ; CHECK-NEXT: uzp1 v2.16b, v0.16b, v2.16b
2024 ; CHECK-NEXT: umlsl2 v1.8h, v0.16b, v2.16b
2025 ; CHECK-NEXT: str q1, [x0]
; The getelementptr indexes by i32, so index 4 is a 16-byte offset.
; The shufflevector mask <8..15> selects the upper half of %0.
2027 %5 = getelementptr inbounds i32, ptr %3, i64 4
2028 %6 = load <8 x i16>, ptr %5, align 4
2029 %7 = trunc <8 x i16> %6 to <8 x i8>
2030 %8 = shufflevector <16 x i8> %0, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
2031 %9 = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %8, <8 x i8> %7)
2032 %10 = sub <8 x i16> %1, %9
2033 store <8 x i16> %10, ptr %2, align 16
; Computes %1 - smull(high 4 halfwords of %0, trunc(<4 x i32> loaded from
; %3 + 16 bytes)) and stores the <4 x i32> result to %2.
; Expected output for all run lines: ldr from [x1, #16], uzp1 on 8h lanes, a
; single smlsl2 accumulating into v1, then str to [x0].
2037 define void @smlsl2_v4i32_uzp1(<8 x i16> %0, <4 x i32> %1, ptr %2, ptr %3) {
2038 ; CHECK-LABEL: smlsl2_v4i32_uzp1:
2040 ; CHECK-NEXT: ldr q2, [x1, #16]
2041 ; CHECK-NEXT: uzp1 v2.8h, v0.8h, v2.8h
2042 ; CHECK-NEXT: smlsl2 v1.4s, v0.8h, v2.8h
2043 ; CHECK-NEXT: str q1, [x0]
; The getelementptr indexes by i32, so index 4 is a 16-byte offset.
; The shufflevector mask <4..7> selects the upper half of %0.
2045 %5 = getelementptr inbounds i32, ptr %3, i64 4
2046 %6 = load <4 x i32>, ptr %5, align 4
2047 %7 = trunc <4 x i32> %6 to <4 x i16>
2048 %8 = shufflevector <8 x i16> %0, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2049 %9 = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %8, <4 x i16> %7)
2050 %10 = sub <4 x i32> %1, %9
2051 store <4 x i32> %10, ptr %2, align 16
; Computes %1 - umull(high 4 halfwords of %0, trunc(<4 x i32> loaded from
; %3 + 16 bytes)) and stores the <4 x i32> result to %2.
; Expected output for all run lines: ldr from [x1, #16], uzp1 on 8h lanes, a
; single umlsl2 accumulating into v1, then str to [x0].
2055 define void @umlsl2_v4i32_uzp1(<8 x i16> %0, <4 x i32> %1, ptr %2, ptr %3) {
2056 ; CHECK-LABEL: umlsl2_v4i32_uzp1:
2058 ; CHECK-NEXT: ldr q2, [x1, #16]
2059 ; CHECK-NEXT: uzp1 v2.8h, v0.8h, v2.8h
2060 ; CHECK-NEXT: umlsl2 v1.4s, v0.8h, v2.8h
2061 ; CHECK-NEXT: str q1, [x0]
; The getelementptr indexes by i32, so index 4 is a 16-byte offset.
; The shufflevector mask <4..7> selects the upper half of %0.
2063 %5 = getelementptr inbounds i32, ptr %3, i64 4
2064 %6 = load <4 x i32>, ptr %5, align 4
2065 %7 = trunc <4 x i32> %6 to <4 x i16>
2066 %8 = shufflevector <8 x i16> %0, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2067 %9 = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %8, <4 x i16> %7)
2068 %10 = sub <4 x i32> %1, %9
2069 store <4 x i32> %10, ptr %2, align 16
; Computes %1 - (pmull(low half of %0, trunc(load %3)) +
; pmull(high half of %0, trunc(load %3 + 16 bytes))) and stores the <8 x i16>
; result to %2. The i32 argument %4 is not used by the instructions shown here.
; Expected output for all run lines: one ldp for both loads, one uzp1 that
; covers both truncates, pmull + pmull2, then separate add and sub, then str.
2073 define void @pmlsl_pmlsl2_v8i16_uzp1(<16 x i8> %0, <8 x i16> %1, ptr %2, ptr %3, i32 %4) {
2074 ; CHECK-LABEL: pmlsl_pmlsl2_v8i16_uzp1:
2075 ; CHECK: // %bb.0: // %entry
2076 ; CHECK-NEXT: ldp q2, q3, [x1]
2077 ; CHECK-NEXT: uzp1 v2.16b, v2.16b, v3.16b
2078 ; CHECK-NEXT: pmull v3.8h, v0.8b, v2.8b
2079 ; CHECK-NEXT: pmull2 v0.8h, v0.16b, v2.16b
2080 ; CHECK-NEXT: add v0.8h, v3.8h, v0.8h
2081 ; CHECK-NEXT: sub v0.8h, v1.8h, v0.8h
2082 ; CHECK-NEXT: str q0, [x0]
; %6 and %9 are the truncated first and second 16-byte loads; %10 and %12 are
; the low and high halves of %0.
2085 %5 = load <8 x i16>, ptr %3, align 4
2086 %6 = trunc <8 x i16> %5 to <8 x i8>
2087 %7 = getelementptr inbounds i32, ptr %3, i64 4
2088 %8 = load <8 x i16>, ptr %7, align 4
2089 %9 = trunc <8 x i16> %8 to <8 x i8>
2090 %10 = shufflevector <16 x i8> %0, <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2091 %11 = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %10, <8 x i8> %6)
2092 %12 = shufflevector <16 x i8> %0, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
2093 %13 = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %12, <8 x i8> %9)
2094 %14 = add <8 x i16> %11, %13
2095 %15 = sub <8 x i16> %1, %14
2096 store <8 x i16> %15, ptr %2, align 16
; Computes %1 - (smull(low half of %0, trunc(load %3)) +
; smull(high half of %0, trunc(load %3 + 16 bytes))) and stores the <8 x i16>
; result to %2. The i32 argument %4 is not used by the instructions shown here.
; Expected output for all run lines: one ldp, one uzp1, then smlsl followed by
; smlsl2 both accumulating into v1, then str.
2100 define void @smlsl_smlsl2_v8i16_uzp1(<16 x i8> %0, <8 x i16> %1, ptr %2, ptr %3, i32 %4) {
2101 ; CHECK-LABEL: smlsl_smlsl2_v8i16_uzp1:
2102 ; CHECK: // %bb.0: // %entry
2103 ; CHECK-NEXT: ldp q2, q3, [x1]
2104 ; CHECK-NEXT: uzp1 v2.16b, v2.16b, v3.16b
2105 ; CHECK-NEXT: smlsl v1.8h, v0.8b, v2.8b
2106 ; CHECK-NEXT: smlsl2 v1.8h, v0.16b, v2.16b
2107 ; CHECK-NEXT: str q1, [x0]
; %6 and %9 are the truncated first and second 16-byte loads; %10 and %12 are
; the low and high halves of %0.
2110 %5 = load <8 x i16>, ptr %3, align 4
2111 %6 = trunc <8 x i16> %5 to <8 x i8>
2112 %7 = getelementptr inbounds i32, ptr %3, i64 4
2113 %8 = load <8 x i16>, ptr %7, align 4
2114 %9 = trunc <8 x i16> %8 to <8 x i8>
2115 %10 = shufflevector <16 x i8> %0, <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2116 %11 = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %10, <8 x i8> %6)
2117 %12 = shufflevector <16 x i8> %0, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
2118 %13 = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %12, <8 x i8> %9)
2119 %14 = add <8 x i16> %11, %13
2120 %15 = sub <8 x i16> %1, %14
2121 store <8 x i16> %15, ptr %2, align 16
; Computes %1 - (umull(low half of %0, trunc(load %3)) +
; umull(high half of %0, trunc(load %3 + 16 bytes))) and stores the <8 x i16>
; result to %2. The i32 argument %4 is not used by the instructions shown here.
; Expected output for all run lines: one ldp, one uzp1, then umlsl followed by
; umlsl2 both accumulating into v1, then str.
2125 define void @umlsl_umlsl2_v8i16_uzp1(<16 x i8> %0, <8 x i16> %1, ptr %2, ptr %3, i32 %4) {
2126 ; CHECK-LABEL: umlsl_umlsl2_v8i16_uzp1:
2127 ; CHECK: // %bb.0: // %entry
2128 ; CHECK-NEXT: ldp q2, q3, [x1]
2129 ; CHECK-NEXT: uzp1 v2.16b, v2.16b, v3.16b
2130 ; CHECK-NEXT: umlsl v1.8h, v0.8b, v2.8b
2131 ; CHECK-NEXT: umlsl2 v1.8h, v0.16b, v2.16b
2132 ; CHECK-NEXT: str q1, [x0]
; %6 and %9 are the truncated first and second 16-byte loads; %10 and %12 are
; the low and high halves of %0.
2135 %5 = load <8 x i16>, ptr %3, align 4
2136 %6 = trunc <8 x i16> %5 to <8 x i8>
2137 %7 = getelementptr inbounds i32, ptr %3, i64 4
2138 %8 = load <8 x i16>, ptr %7, align 4
2139 %9 = trunc <8 x i16> %8 to <8 x i8>
2140 %10 = shufflevector <16 x i8> %0, <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2141 %11 = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %10, <8 x i8> %6)
2142 %12 = shufflevector <16 x i8> %0, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
2143 %13 = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %12, <8 x i8> %9)
2144 %14 = add <8 x i16> %11, %13
2145 %15 = sub <8 x i16> %1, %14
2146 store <8 x i16> %15, ptr %2, align 16
; Computes %1 - (smull(low half of %0, trunc(load %3)) +
; smull(high half of %0, trunc(load %3 + 16 bytes))) on 16-bit inputs and
; stores the <4 x i32> result to %2. The i32 argument %4 is not used by the
; instructions shown here.
; Expected output for all run lines: one ldp, one uzp1 on 8h lanes, then smlsl
; followed by smlsl2 both accumulating into v1, then str.
2150 define void @smlsl_smlsl2_v4i32_uzp1(<8 x i16> %0, <4 x i32> %1, ptr %2, ptr %3, i32 %4) {
2151 ; CHECK-LABEL: smlsl_smlsl2_v4i32_uzp1:
2152 ; CHECK: // %bb.0: // %entry
2153 ; CHECK-NEXT: ldp q2, q3, [x1]
2154 ; CHECK-NEXT: uzp1 v2.8h, v2.8h, v3.8h
2155 ; CHECK-NEXT: smlsl v1.4s, v0.4h, v2.4h
2156 ; CHECK-NEXT: smlsl2 v1.4s, v0.8h, v2.8h
2157 ; CHECK-NEXT: str q1, [x0]
; %6 and %9 are the truncated first and second 16-byte loads; %10 and %12 are
; the low and high halves of %0.
2160 %5 = load <4 x i32>, ptr %3, align 4
2161 %6 = trunc <4 x i32> %5 to <4 x i16>
2162 %7 = getelementptr inbounds i32, ptr %3, i64 4
2163 %8 = load <4 x i32>, ptr %7, align 4
2164 %9 = trunc <4 x i32> %8 to <4 x i16>
2165 %10 = shufflevector <8 x i16> %0, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2166 %11 = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %10, <4 x i16> %6)
2167 %12 = shufflevector <8 x i16> %0, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2168 %13 = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %12, <4 x i16> %9)
2169 %14 = add <4 x i32> %11, %13
2170 %15 = sub <4 x i32> %1, %14
2171 store <4 x i32> %15, ptr %2, align 16
; Computes %1 - (umull(low half of %0, trunc(load %3)) +
; umull(high half of %0, trunc(load %3 + 16 bytes))) on 16-bit inputs and
; stores the <4 x i32> result to %2. The i32 argument %4 is not used by the
; instructions shown here.
; Expected output for all run lines: one ldp, one uzp1 on 8h lanes, then umlsl
; followed by umlsl2 both accumulating into v1, then str.
2175 define void @umlsl_umlsl2_v4i32_uzp1(<8 x i16> %0, <4 x i32> %1, ptr %2, ptr %3, i32 %4) {
2176 ; CHECK-LABEL: umlsl_umlsl2_v4i32_uzp1:
2177 ; CHECK: // %bb.0: // %entry
2178 ; CHECK-NEXT: ldp q2, q3, [x1]
2179 ; CHECK-NEXT: uzp1 v2.8h, v2.8h, v3.8h
2180 ; CHECK-NEXT: umlsl v1.4s, v0.4h, v2.4h
2181 ; CHECK-NEXT: umlsl2 v1.4s, v0.8h, v2.8h
2182 ; CHECK-NEXT: str q1, [x0]
; %6 and %9 are the truncated first and second 16-byte loads; %10 and %12 are
; the low and high halves of %0.
2185 %5 = load <4 x i32>, ptr %3, align 4
2186 %6 = trunc <4 x i32> %5 to <4 x i16>
2187 %7 = getelementptr inbounds i32, ptr %3, i64 4
2188 %8 = load <4 x i32>, ptr %7, align 4
2189 %9 = trunc <4 x i32> %8 to <4 x i16>
2190 %10 = shufflevector <8 x i16> %0, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2191 %11 = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %10, <4 x i16> %6)
2192 %12 = shufflevector <8 x i16> %0, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2193 %13 = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %12, <4 x i16> %9)
2194 %14 = add <4 x i32> %11, %13
2195 %15 = sub <4 x i32> %1, %14
2196 store <4 x i32> %15, ptr %2, align 16
; Reinterprets %1 as <4 x i32>, multiplies its upper two lanes by the truncated
; %0 via the smull intrinsic, truncates the product back to <2 x i32>, and adds
; the lower two lanes of the reinterpreted %1.
; Expected output for all run lines: uzp1 on v0, smull2, xtn, then add.
2200 define <2 x i32> @do_stuff(<2 x i64> %0, <2 x i64> %1) {
2201 ; CHECK-LABEL: do_stuff:
2203 ; CHECK-NEXT: uzp1 v0.4s, v0.4s, v0.4s
2204 ; CHECK-NEXT: smull2 v0.2d, v1.4s, v0.4s
2205 ; CHECK-NEXT: xtn v0.2s, v0.2d
2206 ; CHECK-NEXT: add v0.2s, v0.2s, v1.2s
; Masks <2, 3> and <0, 1> index only the first shuffle operand (%bc.1), so the
; zeroinitializer operand contributes no lanes.
2208 %bc.1 = bitcast <2 x i64> %1 to <4 x i32>
2209 %trunc.0 = trunc <2 x i64> %0 to <2 x i32>
2210 %shuff.hi = shufflevector <4 x i32> %bc.1, <4 x i32> zeroinitializer, <2 x i32> <i32 2, i32 3>
2211 %shuff.lo = shufflevector <4 x i32> %bc.1, <4 x i32> zeroinitializer, <2 x i32> <i32 0, i32 1>
2212 %smull = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuff.hi, <2 x i32> %trunc.0)
2213 %trunc.smull = trunc <2 x i64> %smull to <2 x i32>
2214 %final = add <2 x i32> %trunc.smull, %shuff.lo
2215 ret <2 x i32> %final
; Declarations of the AArch64 NEON pmull/smull/umull intrinsics called by the
; tests in this file. Each takes two 64-bit vectors and returns a 128-bit
; vector whose elements are twice as wide as the input elements.
2218 declare <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8>, <8 x i8>)
2219 declare <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8>, <8 x i8>)
2220 declare <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8>, <8 x i8>)
2221 declare <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16>, <4 x i16>)
2222 declare <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16>, <4 x i16>)
2223 declare <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32>, <2 x i32>)