; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,X86-SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,X64-SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop | FileCheck %s --check-prefixes=X64-AVX,X64-XOP
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X64-AVX,X64-AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefixes=X64-AVX,X64-AVX512DQ

;
; PowOf2 (uniform)
;
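; A uniform multiply by a power of two lowers to a single immediate shift
; (x * 8 == x << 3). x86 has no byte-granular shift, so the v16i8 case below
; shifts with psllw and then masks away the bits that crossed a byte boundary.
;
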
define <2 x i64> @mul_v2i64_8(<2 x i64> %a0) nounwind {
; SSE-LABEL: mul_v2i64_8:
; SSE: # %bb.0:
; SSE-NEXT: psllq $3, %xmm0
; SSE-NEXT: ret{{[l|q]}}
;
; X64-AVX-LABEL: mul_v2i64_8:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vpsllq $3, %xmm0, %xmm0
; X64-AVX-NEXT: retq
  %1 = mul <2 x i64> %a0, <i64 8, i64 8>
  ret <2 x i64> %1
}

define <4 x i32> @mul_v4i32_8(<4 x i32> %a0) nounwind {
; SSE-LABEL: mul_v4i32_8:
; SSE: # %bb.0:
; SSE-NEXT: pslld $3, %xmm0
; SSE-NEXT: ret{{[l|q]}}
;
; X64-AVX-LABEL: mul_v4i32_8:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vpslld $3, %xmm0, %xmm0
; X64-AVX-NEXT: retq
  %1 = mul <4 x i32> %a0, <i32 8, i32 8, i32 8, i32 8>
  ret <4 x i32> %1
}

define <8 x i16> @mul_v8i16_8(<8 x i16> %a0) nounwind {
; SSE-LABEL: mul_v8i16_8:
; SSE: # %bb.0:
; SSE-NEXT: psllw $3, %xmm0
; SSE-NEXT: ret{{[l|q]}}
;
; X64-AVX-LABEL: mul_v8i16_8:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vpsllw $3, %xmm0, %xmm0
; X64-AVX-NEXT: retq
  %1 = mul <8 x i16> %a0, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  ret <8 x i16> %1
}

define <16 x i8> @mul_v16i8_32(<16 x i8> %a0) nounwind {
; X86-SSE-LABEL: mul_v16i8_32:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: psllw $5, %xmm0
; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE-NEXT: retl
;
; X64-SSE-LABEL: mul_v16i8_32:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: psllw $5, %xmm0
; X64-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE-NEXT: retq
;
; X64-XOP-LABEL: mul_v16i8_32:
; X64-XOP: # %bb.0:
; X64-XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-XOP-NEXT: retq
;
; X64-AVX2-LABEL: mul_v16i8_32:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vpsllw $5, %xmm0, %xmm0
; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX2-NEXT: retq
;
; X64-AVX512DQ-LABEL: mul_v16i8_32:
; X64-AVX512DQ: # %bb.0:
; X64-AVX512DQ-NEXT: vpsllw $5, %xmm0, %xmm0
; X64-AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX512DQ-NEXT: retq
  %1 = mul <16 x i8> %a0, <i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32>
  ret <16 x i8> %1
}

;
; PowOf2 (non-uniform)
;
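; Distinct power-of-two elements need a per-element shift amount: a variable
; shift where the ISA has one (vpsllvq/vpsllvd on AVX2+, vpshl* on XOP),
; otherwise two immediate shifts blended together, or a plain vector multiply.
;
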
define <2 x i64> @mul_v2i64_32_8(<2 x i64> %a0) nounwind {
; SSE-LABEL: mul_v2i64_32_8:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psllq $3, %xmm1
; SSE-NEXT: psllq $5, %xmm0
; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE-NEXT: ret{{[l|q]}}
;
; X64-XOP-LABEL: mul_v2i64_32_8:
; X64-XOP: # %bb.0:
; X64-XOP-NEXT: vpshlq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-XOP-NEXT: retq
;
; X64-AVX2-LABEL: mul_v2i64_32_8:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX2-NEXT: retq
;
; X64-AVX512DQ-LABEL: mul_v2i64_32_8:
; X64-AVX512DQ: # %bb.0:
; X64-AVX512DQ-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX512DQ-NEXT: retq
  %1 = mul <2 x i64> %a0, <i64 32, i64 8>
  ret <2 x i64> %1
}

define <4 x i32> @mul_v4i32_1_2_4_8(<4 x i32> %a0) nounwind {
; X86-SSE-LABEL: mul_v4i32_1_2_4_8:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE-NEXT: retl
;
; X64-SSE-LABEL: mul_v4i32_1_2_4_8:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE-NEXT: retq
;
; X64-XOP-LABEL: mul_v4i32_1_2_4_8:
; X64-XOP: # %bb.0:
; X64-XOP-NEXT: vpshld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-XOP-NEXT: retq
;
; X64-AVX2-LABEL: mul_v4i32_1_2_4_8:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX2-NEXT: retq
;
; X64-AVX512DQ-LABEL: mul_v4i32_1_2_4_8:
; X64-AVX512DQ: # %bb.0:
; X64-AVX512DQ-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX512DQ-NEXT: retq
  %1 = mul <4 x i32> %a0, <i32 1, i32 2, i32 4, i32 8>
  ret <4 x i32> %1
}

define <8 x i16> @mul_v8i16_1_2_4_8_16_32_64_128(<8 x i16> %a0) nounwind {
; X86-SSE-LABEL: mul_v8i16_1_2_4_8_16_32_64_128:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE-NEXT: retl
;
; X64-SSE-LABEL: mul_v8i16_1_2_4_8_16_32_64_128:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE-NEXT: retq
;
; X64-XOP-LABEL: mul_v8i16_1_2_4_8_16_32_64_128:
; X64-XOP: # %bb.0:
; X64-XOP-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-XOP-NEXT: retq
;
; X64-AVX2-LABEL: mul_v8i16_1_2_4_8_16_32_64_128:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX2-NEXT: retq
;
; X64-AVX512DQ-LABEL: mul_v8i16_1_2_4_8_16_32_64_128:
; X64-AVX512DQ: # %bb.0:
; X64-AVX512DQ-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX512DQ-NEXT: retq
  %1 = mul <8 x i16> %a0, <i16 1, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128>
  ret <8 x i16> %1
}

define <16 x i8> @mul_v16i8_1_2_4_8_1_2_4_8_1_2_4_8_1_2_4_8(<16 x i8> %a0) nounwind {
; SSE-LABEL: mul_v16i8_1_2_4_8_1_2_4_8_1_2_4_8_1_2_4_8:
; SSE: # %bb.0:
; SSE-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8,1,2,4,8]
; SSE-NEXT: pmullw %xmm2, %xmm0
; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; SSE-NEXT: pand %xmm3, %xmm0
; SSE-NEXT: pmullw %xmm2, %xmm1
; SSE-NEXT: pand %xmm3, %xmm1
; SSE-NEXT: packuswb %xmm0, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: ret{{[l|q]}}
;
; X64-XOP-LABEL: mul_v16i8_1_2_4_8_1_2_4_8_1_2_4_8_1_2_4_8:
; X64-XOP: # %bb.0:
; X64-XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-XOP-NEXT: retq
;
; X64-AVX2-LABEL: mul_v16i8_1_2_4_8_1_2_4_8_1_2_4_8_1_2_4_8:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; X64-AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vzeroupper
; X64-AVX2-NEXT: retq
;
; X64-AVX512DQ-LABEL: mul_v16i8_1_2_4_8_1_2_4_8_1_2_4_8_1_2_4_8:
; X64-AVX512DQ: # %bb.0:
; X64-AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; X64-AVX512DQ-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; X64-AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; X64-AVX512DQ-NEXT: vzeroupper
; X64-AVX512DQ-NEXT: retq
  %1 = mul <16 x i8> %a0, <i8 1, i8 2, i8 4, i8 8, i8 1, i8 2, i8 4, i8 8, i8 1, i8 2, i8 4, i8 8, i8 1, i8 2, i8 4, i8 8>
  ret <16 x i8> %1
}

;
; PowOf2 + 1 (uniform)
;
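; A multiply by 2^n + 1 decomposes into a shift and an add
; (x * 17 == (x << 4) + x), except where a legal vector multiply is used
; directly (pmullw/pmulld, or vpmullq with AVX512DQ).
;
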
define <2 x i64> @mul_v2i64_17(<2 x i64> %a0) nounwind {
; SSE-LABEL: mul_v2i64_17:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psllq $4, %xmm1
; SSE-NEXT: paddq %xmm0, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: ret{{[l|q]}}
;
; X64-XOP-LABEL: mul_v2i64_17:
; X64-XOP: # %bb.0:
; X64-XOP-NEXT: vpsllq $4, %xmm0, %xmm1
; X64-XOP-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; X64-XOP-NEXT: retq
;
; X64-AVX2-LABEL: mul_v2i64_17:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vpsllq $4, %xmm0, %xmm1
; X64-AVX2-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; X64-AVX2-NEXT: retq
;
; X64-AVX512DQ-LABEL: mul_v2i64_17:
; X64-AVX512DQ: # %bb.0:
; X64-AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX512DQ-NEXT: retq
  %1 = mul <2 x i64> %a0, <i64 17, i64 17>
  ret <2 x i64> %1
}

define <4 x i32> @mul_v4i32_17(<4 x i32> %a0) nounwind {
; X86-SSE-LABEL: mul_v4i32_17:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE-NEXT: retl
;
; X64-SSE-LABEL: mul_v4i32_17:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE-NEXT: retq
;
; X64-XOP-LABEL: mul_v4i32_17:
; X64-XOP: # %bb.0:
; X64-XOP-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-XOP-NEXT: retq
;
; X64-AVX2-LABEL: mul_v4i32_17:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [17,17,17,17]
; X64-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: retq
;
; X64-AVX512DQ-LABEL: mul_v4i32_17:
; X64-AVX512DQ: # %bb.0:
; X64-AVX512DQ-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; X64-AVX512DQ-NEXT: retq
  %1 = mul <4 x i32> %a0, <i32 17, i32 17, i32 17, i32 17>
  ret <4 x i32> %1
}

define <8 x i16> @mul_v8i16_17(<8 x i16> %a0) nounwind {
; X86-SSE-LABEL: mul_v8i16_17:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE-NEXT: retl
;
; X64-SSE-LABEL: mul_v8i16_17:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: mul_v8i16_17:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX-NEXT: retq
  %1 = mul <8 x i16> %a0, <i16 17, i16 17, i16 17, i16 17, i16 17, i16 17, i16 17, i16 17>
  ret <8 x i16> %1
}

define <16 x i8> @mul_v16i8_17(<16 x i8> %a0) nounwind {
; X86-SSE-LABEL: mul_v16i8_17:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: movdqa %xmm0, %xmm1
; X86-SSE-NEXT: psllw $4, %xmm1
; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-SSE-NEXT: paddb %xmm0, %xmm1
; X86-SSE-NEXT: movdqa %xmm1, %xmm0
; X86-SSE-NEXT: retl
;
; X64-SSE-LABEL: mul_v16i8_17:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: movdqa %xmm0, %xmm1
; X64-SSE-NEXT: psllw $4, %xmm1
; X64-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; X64-SSE-NEXT: paddb %xmm0, %xmm1
; X64-SSE-NEXT: movdqa %xmm1, %xmm0
; X64-SSE-NEXT: retq
;
; X64-XOP-LABEL: mul_v16i8_17:
; X64-XOP: # %bb.0:
; X64-XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; X64-XOP-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; X64-XOP-NEXT: retq
;
; X64-AVX2-LABEL: mul_v16i8_17:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vpsllw $4, %xmm0, %xmm1
; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; X64-AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; X64-AVX2-NEXT: retq
;
; X64-AVX512DQ-LABEL: mul_v16i8_17:
; X64-AVX512DQ: # %bb.0:
; X64-AVX512DQ-NEXT: vpsllw $4, %xmm0, %xmm1
; X64-AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; X64-AVX512DQ-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; X64-AVX512DQ-NEXT: retq
  %1 = mul <16 x i8> %a0, <i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17>
  ret <16 x i8> %1
}

define <4 x i64> @mul_v4i64_17(<4 x i64> %a0) nounwind {
; SSE-LABEL: mul_v4i64_17:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: psllq $4, %xmm2
; SSE-NEXT: paddq %xmm0, %xmm2
; SSE-NEXT: movdqa %xmm1, %xmm3
; SSE-NEXT: psllq $4, %xmm3
; SSE-NEXT: paddq %xmm1, %xmm3
; SSE-NEXT: movdqa %xmm2, %xmm0
; SSE-NEXT: movdqa %xmm3, %xmm1
; SSE-NEXT: ret{{[l|q]}}
;
; X64-XOP-LABEL: mul_v4i64_17:
; X64-XOP: # %bb.0:
; X64-XOP-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-XOP-NEXT: vpsllq $4, %xmm1, %xmm2
; X64-XOP-NEXT: vpaddq %xmm1, %xmm2, %xmm1
; X64-XOP-NEXT: vpsllq $4, %xmm0, %xmm2
; X64-XOP-NEXT: vpaddq %xmm0, %xmm2, %xmm0
; X64-XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X64-XOP-NEXT: retq
;
; X64-AVX2-LABEL: mul_v4i64_17:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vpsllq $4, %ymm0, %ymm1
; X64-AVX2-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; X64-AVX2-NEXT: retq
;
; X64-AVX512DQ-LABEL: mul_v4i64_17:
; X64-AVX512DQ: # %bb.0:
; X64-AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0
; X64-AVX512DQ-NEXT: retq
  %1 = mul <4 x i64> %a0, <i64 17, i64 17, i64 17, i64 17>
  ret <4 x i64> %1
}

define <8 x i32> @mul_v8i32_17(<8 x i32> %a0) nounwind {
; SSE-LABEL: mul_v8i32_17:
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [17,17,17,17]
; SSE-NEXT: pmulld %xmm2, %xmm0
; SSE-NEXT: pmulld %xmm2, %xmm1
; SSE-NEXT: ret{{[l|q]}}
;
; X64-XOP-LABEL: mul_v8i32_17:
; X64-XOP: # %bb.0:
; X64-XOP-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-XOP-NEXT: vpslld $4, %xmm1, %xmm2
; X64-XOP-NEXT: vpaddd %xmm1, %xmm2, %xmm1
; X64-XOP-NEXT: vpslld $4, %xmm0, %xmm2
; X64-XOP-NEXT: vpaddd %xmm0, %xmm2, %xmm0
; X64-XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X64-XOP-NEXT: retq
;
; X64-AVX2-LABEL: mul_v8i32_17:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [17,17,17,17,17,17,17,17]
; X64-AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: retq
;
; X64-AVX512DQ-LABEL: mul_v8i32_17:
; X64-AVX512DQ: # %bb.0:
; X64-AVX512DQ-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
; X64-AVX512DQ-NEXT: retq
  %1 = mul <8 x i32> %a0, <i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17>
  ret <8 x i32> %1
}

define <16 x i16> @mul_v16i16_17(<16 x i16> %a0) nounwind {
; SSE-LABEL: mul_v16i16_17:
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [17,17,17,17,17,17,17,17]
; SSE-NEXT: pmullw %xmm2, %xmm0
; SSE-NEXT: pmullw %xmm2, %xmm1
; SSE-NEXT: ret{{[l|q]}}
;
; X64-XOP-LABEL: mul_v16i16_17:
; X64-XOP: # %bb.0:
; X64-XOP-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-XOP-NEXT: vpsllw $4, %xmm1, %xmm2
; X64-XOP-NEXT: vpaddw %xmm1, %xmm2, %xmm1
; X64-XOP-NEXT: vpsllw $4, %xmm0, %xmm2
; X64-XOP-NEXT: vpaddw %xmm0, %xmm2, %xmm0
; X64-XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X64-XOP-NEXT: retq
;
; X64-AVX2-LABEL: mul_v16i16_17:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; X64-AVX2-NEXT: retq
;
; X64-AVX512DQ-LABEL: mul_v16i16_17:
; X64-AVX512DQ: # %bb.0:
; X64-AVX512DQ-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; X64-AVX512DQ-NEXT: retq
  %1 = mul <16 x i16> %a0, <i16 17, i16 17, i16 17, i16 17, i16 17, i16 17, i16 17, i16 17, i16 17, i16 17, i16 17, i16 17, i16 17, i16 17, i16 17, i16 17>
  ret <16 x i16> %1
}

define <32 x i8> @mul_v32i8_17(<32 x i8> %a0) nounwind {
; SSE-LABEL: mul_v32i8_17:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: psllw $4, %xmm2
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; SSE-NEXT: pand %xmm4, %xmm2
; SSE-NEXT: paddb %xmm0, %xmm2
; SSE-NEXT: movdqa %xmm1, %xmm3
; SSE-NEXT: psllw $4, %xmm3
; SSE-NEXT: pand %xmm4, %xmm3
; SSE-NEXT: paddb %xmm1, %xmm3
; SSE-NEXT: movdqa %xmm2, %xmm0
; SSE-NEXT: movdqa %xmm3, %xmm1
; SSE-NEXT: ret{{[l|q]}}
;
; X64-XOP-LABEL: mul_v32i8_17:
; X64-XOP: # %bb.0:
; X64-XOP-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-XOP-NEXT: vmovdqa {{.*#+}} xmm2 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
; X64-XOP-NEXT: vpshlb %xmm2, %xmm1, %xmm3
; X64-XOP-NEXT: vpaddb %xmm1, %xmm3, %xmm1
; X64-XOP-NEXT: vpshlb %xmm2, %xmm0, %xmm2
; X64-XOP-NEXT: vpaddb %xmm0, %xmm2, %xmm0
; X64-XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X64-XOP-NEXT: retq
;
; X64-AVX2-LABEL: mul_v32i8_17:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vpsllw $4, %ymm0, %ymm1
; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; X64-AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; X64-AVX2-NEXT: retq
;
; X64-AVX512DQ-LABEL: mul_v32i8_17:
; X64-AVX512DQ: # %bb.0:
; X64-AVX512DQ-NEXT: vpsllw $4, %ymm0, %ymm1
; X64-AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; X64-AVX512DQ-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; X64-AVX512DQ-NEXT: retq
  %1 = mul <32 x i8> %a0, <i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17>
  ret <32 x i8> %1
}

;
; -(PowOf2 + 1) (uniform)
;
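; -(2^n + 1) adds a final negation: x * -1025 == -((x << 10) + x), i.e.
; shift, add, then subtract the result from zero.
;
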
define <2 x i64> @mul_v2i64_neg1025(<2 x i64> %a0) nounwind {
; SSE-LABEL: mul_v2i64_neg1025:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psllq $10, %xmm1
; SSE-NEXT: paddq %xmm0, %xmm1
; SSE-NEXT: pxor %xmm0, %xmm0
; SSE-NEXT: psubq %xmm1, %xmm0
; SSE-NEXT: ret{{[l|q]}}
;
; X64-XOP-LABEL: mul_v2i64_neg1025:
; X64-XOP: # %bb.0:
; X64-XOP-NEXT: vpsllq $10, %xmm0, %xmm1
; X64-XOP-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; X64-XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X64-XOP-NEXT: vpsubq %xmm0, %xmm1, %xmm0
; X64-XOP-NEXT: retq
;
; X64-AVX2-LABEL: mul_v2i64_neg1025:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vpsllq $10, %xmm0, %xmm1
; X64-AVX2-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; X64-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X64-AVX2-NEXT: vpsubq %xmm0, %xmm1, %xmm0
; X64-AVX2-NEXT: retq
;
; X64-AVX512DQ-LABEL: mul_v2i64_neg1025:
; X64-AVX512DQ: # %bb.0:
; X64-AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX512DQ-NEXT: retq
  %1 = mul <2 x i64> %a0, <i64 -1025, i64 -1025>
  ret <2 x i64> %1
}

define <4 x i32> @mul_v4i32_neg33(<4 x i32> %a0) nounwind {
; X86-SSE-LABEL: mul_v4i32_neg33:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE-NEXT: retl
;
; X64-SSE-LABEL: mul_v4i32_neg33:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE-NEXT: retq
;
; X64-XOP-LABEL: mul_v4i32_neg33:
; X64-XOP: # %bb.0:
; X64-XOP-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-XOP-NEXT: retq
;
; X64-AVX2-LABEL: mul_v4i32_neg33:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294967263,4294967263,4294967263,4294967263]
; X64-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: retq
;
; X64-AVX512DQ-LABEL: mul_v4i32_neg33:
; X64-AVX512DQ: # %bb.0:
; X64-AVX512DQ-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; X64-AVX512DQ-NEXT: retq
  %1 = mul <4 x i32> %a0, <i32 -33, i32 -33, i32 -33, i32 -33>
  ret <4 x i32> %1
}

define <8 x i16> @mul_v8i16_neg9(<8 x i16> %a0) nounwind {
; X86-SSE-LABEL: mul_v8i16_neg9:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE-NEXT: retl
;
; X64-SSE-LABEL: mul_v8i16_neg9:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: mul_v8i16_neg9:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX-NEXT: retq
  %1 = mul <8 x i16> %a0, <i16 -9, i16 -9, i16 -9, i16 -9, i16 -9, i16 -9, i16 -9, i16 -9>
  ret <8 x i16> %1
}

define <16 x i8> @mul_v16i8_neg5(<16 x i8> %a0) nounwind {
; X86-SSE-LABEL: mul_v16i8_neg5:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: movdqa %xmm0, %xmm1
; X86-SSE-NEXT: psllw $2, %xmm1
; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-SSE-NEXT: paddb %xmm0, %xmm1
; X86-SSE-NEXT: pxor %xmm0, %xmm0
; X86-SSE-NEXT: psubb %xmm1, %xmm0
; X86-SSE-NEXT: retl
;
; X64-SSE-LABEL: mul_v16i8_neg5:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: movdqa %xmm0, %xmm1
; X64-SSE-NEXT: psllw $2, %xmm1
; X64-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; X64-SSE-NEXT: paddb %xmm0, %xmm1
; X64-SSE-NEXT: pxor %xmm0, %xmm0
; X64-SSE-NEXT: psubb %xmm1, %xmm0
; X64-SSE-NEXT: retq
;
; X64-XOP-LABEL: mul_v16i8_neg5:
; X64-XOP: # %bb.0:
; X64-XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; X64-XOP-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; X64-XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X64-XOP-NEXT: vpsubb %xmm0, %xmm1, %xmm0
; X64-XOP-NEXT: retq
;
; X64-AVX2-LABEL: mul_v16i8_neg5:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vpsllw $2, %xmm0, %xmm1
; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; X64-AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; X64-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X64-AVX2-NEXT: vpsubb %xmm0, %xmm1, %xmm0
; X64-AVX2-NEXT: retq
;
; X64-AVX512DQ-LABEL: mul_v16i8_neg5:
; X64-AVX512DQ: # %bb.0:
; X64-AVX512DQ-NEXT: vpsllw $2, %xmm0, %xmm1
; X64-AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; X64-AVX512DQ-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; X64-AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X64-AVX512DQ-NEXT: vpsubb %xmm0, %xmm1, %xmm0
; X64-AVX512DQ-NEXT: retq
  %1 = mul <16 x i8> %a0, <i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5>
  ret <16 x i8> %1
}

define <4 x i64> @mul_v4i64_neg1025(<4 x i64> %a0) nounwind {
; SSE-LABEL: mul_v4i64_neg1025:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm3
; SSE-NEXT: psllq $10, %xmm3
; SSE-NEXT: paddq %xmm0, %xmm3
; SSE-NEXT: pxor %xmm2, %xmm2
; SSE-NEXT: pxor %xmm0, %xmm0
; SSE-NEXT: psubq %xmm3, %xmm0
; SSE-NEXT: movdqa %xmm1, %xmm3
; SSE-NEXT: psllq $10, %xmm3
; SSE-NEXT: paddq %xmm1, %xmm3
; SSE-NEXT: psubq %xmm3, %xmm2
; SSE-NEXT: movdqa %xmm2, %xmm1
; SSE-NEXT: ret{{[l|q]}}
;
; X64-XOP-LABEL: mul_v4i64_neg1025:
; X64-XOP: # %bb.0:
; X64-XOP-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-XOP-NEXT: vpsllq $10, %xmm1, %xmm2
; X64-XOP-NEXT: vpaddq %xmm1, %xmm2, %xmm1
; X64-XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
; X64-XOP-NEXT: vpsubq %xmm1, %xmm2, %xmm1
; X64-XOP-NEXT: vpsllq $10, %xmm0, %xmm3
; X64-XOP-NEXT: vpaddq %xmm0, %xmm3, %xmm0
; X64-XOP-NEXT: vpsubq %xmm0, %xmm2, %xmm0
; X64-XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X64-XOP-NEXT: retq
;
; X64-AVX2-LABEL: mul_v4i64_neg1025:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vpsllq $10, %ymm0, %ymm1
; X64-AVX2-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; X64-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X64-AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm0
; X64-AVX2-NEXT: retq
;
; X64-AVX512DQ-LABEL: mul_v4i64_neg1025:
; X64-AVX512DQ: # %bb.0:
; X64-AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0
; X64-AVX512DQ-NEXT: retq
  %1 = mul <4 x i64> %a0, <i64 -1025, i64 -1025, i64 -1025, i64 -1025>
  ret <4 x i64> %1
}

define <8 x i32> @mul_v8i32_neg33(<8 x i32> %a0) nounwind {
; SSE-LABEL: mul_v8i32_neg33:
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [4294967263,4294967263,4294967263,4294967263]
; SSE-NEXT: pmulld %xmm2, %xmm0
; SSE-NEXT: pmulld %xmm2, %xmm1
; SSE-NEXT: ret{{[l|q]}}
;
; X64-XOP-LABEL: mul_v8i32_neg33:
; X64-XOP: # %bb.0:
; X64-XOP-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-XOP-NEXT: vpslld $5, %xmm1, %xmm2
; X64-XOP-NEXT: vpaddd %xmm1, %xmm2, %xmm1
; X64-XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
; X64-XOP-NEXT: vpsubd %xmm1, %xmm2, %xmm1
; X64-XOP-NEXT: vpslld $5, %xmm0, %xmm3
; X64-XOP-NEXT: vpaddd %xmm0, %xmm3, %xmm0
; X64-XOP-NEXT: vpsubd %xmm0, %xmm2, %xmm0
; X64-XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X64-XOP-NEXT: retq
;
; X64-AVX2-LABEL: mul_v8i32_neg33:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4294967263,4294967263,4294967263,4294967263,4294967263,4294967263,4294967263,4294967263]
; X64-AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: retq
;
; X64-AVX512DQ-LABEL: mul_v8i32_neg33:
; X64-AVX512DQ: # %bb.0:
; X64-AVX512DQ-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
; X64-AVX512DQ-NEXT: retq
  %1 = mul <8 x i32> %a0, <i32 -33, i32 -33, i32 -33, i32 -33, i32 -33, i32 -33, i32 -33, i32 -33>
  ret <8 x i32> %1
}

define <16 x i16> @mul_v16i16_neg9(<16 x i16> %a0) nounwind {
; SSE-LABEL: mul_v16i16_neg9:
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65527,65527,65527,65527,65527,65527,65527,65527]
; SSE-NEXT: pmullw %xmm2, %xmm0
; SSE-NEXT: pmullw %xmm2, %xmm1
; SSE-NEXT: ret{{[l|q]}}
;
; X64-XOP-LABEL: mul_v16i16_neg9:
; X64-XOP: # %bb.0:
; X64-XOP-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-XOP-NEXT: vpsllw $3, %xmm1, %xmm2
; X64-XOP-NEXT: vpaddw %xmm1, %xmm2, %xmm1
; X64-XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
; X64-XOP-NEXT: vpsubw %xmm1, %xmm2, %xmm1
; X64-XOP-NEXT: vpsllw $3, %xmm0, %xmm3
; X64-XOP-NEXT: vpaddw %xmm0, %xmm3, %xmm0
; X64-XOP-NEXT: vpsubw %xmm0, %xmm2, %xmm0
; X64-XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X64-XOP-NEXT: retq
;
; X64-AVX2-LABEL: mul_v16i16_neg9:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; X64-AVX2-NEXT: retq
;
; X64-AVX512DQ-LABEL: mul_v16i16_neg9:
; X64-AVX512DQ: # %bb.0:
; X64-AVX512DQ-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; X64-AVX512DQ-NEXT: retq
  %1 = mul <16 x i16> %a0, <i16 -9, i16 -9, i16 -9, i16 -9, i16 -9, i16 -9, i16 -9, i16 -9, i16 -9, i16 -9, i16 -9, i16 -9, i16 -9, i16 -9, i16 -9, i16 -9>
  ret <16 x i16> %1
}

define <32 x i8> @mul_v32i8_neg5(<32 x i8> %a0) nounwind {
; SSE-LABEL: mul_v32i8_neg5:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm3
; SSE-NEXT: psllw $2, %xmm3
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; SSE-NEXT: pand %xmm4, %xmm3
; SSE-NEXT: paddb %xmm0, %xmm3
; SSE-NEXT: pxor %xmm2, %xmm2
; SSE-NEXT: pxor %xmm0, %xmm0
; SSE-NEXT: psubb %xmm3, %xmm0
; SSE-NEXT: movdqa %xmm1, %xmm3
; SSE-NEXT: psllw $2, %xmm3
; SSE-NEXT: pand %xmm4, %xmm3
; SSE-NEXT: paddb %xmm1, %xmm3
; SSE-NEXT: psubb %xmm3, %xmm2
; SSE-NEXT: movdqa %xmm2, %xmm1
; SSE-NEXT: ret{{[l|q]}}
;
; X64-XOP-LABEL: mul_v32i8_neg5:
; X64-XOP: # %bb.0:
; X64-XOP-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-XOP-NEXT: vmovdqa {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; X64-XOP-NEXT: vpshlb %xmm2, %xmm1, %xmm3
; X64-XOP-NEXT: vpaddb %xmm1, %xmm3, %xmm1
; X64-XOP-NEXT: vpxor %xmm3, %xmm3, %xmm3
; X64-XOP-NEXT: vpsubb %xmm1, %xmm3, %xmm1
; X64-XOP-NEXT: vpshlb %xmm2, %xmm0, %xmm2
; X64-XOP-NEXT: vpaddb %xmm0, %xmm2, %xmm0
; X64-XOP-NEXT: vpsubb %xmm0, %xmm3, %xmm0
; X64-XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X64-XOP-NEXT: retq
;
; X64-AVX2-LABEL: mul_v32i8_neg5:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vpsllw $2, %ymm0, %ymm1
; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; X64-AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; X64-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X64-AVX2-NEXT: vpsubb %ymm0, %ymm1, %ymm0
; X64-AVX2-NEXT: retq
;
; X64-AVX512DQ-LABEL: mul_v32i8_neg5:
; X64-AVX512DQ: # %bb.0:
; X64-AVX512DQ-NEXT: vpsllw $2, %ymm0, %ymm1
; X64-AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; X64-AVX512DQ-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; X64-AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X64-AVX512DQ-NEXT: vpsubb %ymm0, %ymm1, %ymm0
; X64-AVX512DQ-NEXT: retq
  %1 = mul <32 x i8> %a0, <i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5>
  ret <32 x i8> %1
}

;
; PowOf2 + 1 (non-uniform)
;
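; Non-uniform 2^n + 1 constants generally fall back to a real multiply; for
; v2i64 without AVX512DQ that is the usual expansion of a 64-bit multiply
; from 32-bit pmuludq halves, shifts and adds.
;
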
define <2 x i64> @mul_v2i64_17_65(<2 x i64> %a0) nounwind {
; X86-SSE-LABEL: mul_v2i64_17_65:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [17,0,65,0]
; X86-SSE-NEXT: movdqa %xmm0, %xmm2
; X86-SSE-NEXT: pmuludq %xmm1, %xmm2
; X86-SSE-NEXT: psrlq $32, %xmm0
; X86-SSE-NEXT: pmuludq %xmm1, %xmm0
; X86-SSE-NEXT: psllq $32, %xmm0
; X86-SSE-NEXT: paddq %xmm2, %xmm0
; X86-SSE-NEXT: retl
;
; X64-SSE-LABEL: mul_v2i64_17_65:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = [17,65]
; X64-SSE-NEXT: movdqa %xmm0, %xmm2
; X64-SSE-NEXT: pmuludq %xmm1, %xmm2
; X64-SSE-NEXT: psrlq $32, %xmm0
; X64-SSE-NEXT: pmuludq %xmm1, %xmm0
; X64-SSE-NEXT: psllq $32, %xmm0
; X64-SSE-NEXT: paddq %xmm2, %xmm0
; X64-SSE-NEXT: retq
;
; X64-XOP-LABEL: mul_v2i64_17_65:
; X64-XOP: # %bb.0:
; X64-XOP-NEXT: vmovdqa {{.*#+}} xmm1 = [17,65]
; X64-XOP-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
; X64-XOP-NEXT: vpsrlq $32, %xmm0, %xmm0
; X64-XOP-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; X64-XOP-NEXT: vpsllq $32, %xmm0, %xmm0
; X64-XOP-NEXT: vpaddq %xmm0, %xmm2, %xmm0
; X64-XOP-NEXT: retq
;
; X64-AVX2-LABEL: mul_v2i64_17_65:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [17,65]
; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
; X64-AVX2-NEXT: vpsrlq $32, %xmm0, %xmm0
; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vpsllq $32, %xmm0, %xmm0
; X64-AVX2-NEXT: vpaddq %xmm0, %xmm2, %xmm0
; X64-AVX2-NEXT: retq
;
; X64-AVX512DQ-LABEL: mul_v2i64_17_65:
; X64-AVX512DQ: # %bb.0:
; X64-AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX512DQ-NEXT: retq
  %1 = mul <2 x i64> %a0, <i64 17, i64 65>
  ret <2 x i64> %1
}

define <4 x i32> @mul_v4i32_5_17_33_65(<4 x i32> %a0) nounwind {
; X86-SSE-LABEL: mul_v4i32_5_17_33_65:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE-NEXT: retl
;
; X64-SSE-LABEL: mul_v4i32_5_17_33_65:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: mul_v4i32_5_17_33_65:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX-NEXT: retq
  %1 = mul <4 x i32> %a0, <i32 5, i32 17, i32 33, i32 65>
  ret <4 x i32> %1
}

define <8 x i16> @mul_v8i16_2_3_9_17_33_65_129_257(<8 x i16> %a0) nounwind {
; X86-SSE-LABEL: mul_v8i16_2_3_9_17_33_65_129_257:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE-NEXT: retl
;
; X64-SSE-LABEL: mul_v8i16_2_3_9_17_33_65_129_257:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: mul_v8i16_2_3_9_17_33_65_129_257:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX-NEXT: retq
  %1 = mul <8 x i16> %a0, <i16 2, i16 3, i16 9, i16 17, i16 33, i16 65, i16 129, i16 257>
  ret <8 x i16> %1
}

define <16 x i8> @mul_v16i8_2_3_9_17_33_65_129_2_3_9_17_33_65_129_2_3(<16 x i8> %a0) nounwind {
; X86-SSE-LABEL: mul_v16i8_2_3_9_17_33_65_129_2_3_9_17_33_65_129_2_3:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; X86-SSE-NEXT: pand %xmm2, %xmm0
; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-SSE-NEXT: pand %xmm2, %xmm1
; X86-SSE-NEXT: packuswb %xmm0, %xmm1
; X86-SSE-NEXT: movdqa %xmm1, %xmm0
; X86-SSE-NEXT: retl
;
; X64-SSE-LABEL: mul_v16i8_2_3_9_17_33_65_129_2_3_9_17_33_65_129_2_3:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X64-SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; X64-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; X64-SSE-NEXT: pand %xmm2, %xmm0
; X64-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; X64-SSE-NEXT: pand %xmm2, %xmm1
; X64-SSE-NEXT: packuswb %xmm0, %xmm1
; X64-SSE-NEXT: movdqa %xmm1, %xmm0
; X64-SSE-NEXT: retq
;
; X64-XOP-LABEL: mul_v16i8_2_3_9_17_33_65_129_2_3_9_17_33_65_129_2_3:
; X64-XOP: # %bb.0:
; X64-XOP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; X64-XOP-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; X64-XOP-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X64-XOP-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],xmm1[0,2,4,6,8,10,12,14]
; X64-XOP-NEXT: retq
;
; X64-AVX2-LABEL: mul_v16i8_2_3_9_17_33_65_129_2_3_9_17_33_65_129_2_3:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; X64-AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vzeroupper
; X64-AVX2-NEXT: retq
;
; X64-AVX512DQ-LABEL: mul_v16i8_2_3_9_17_33_65_129_2_3_9_17_33_65_129_2_3:
; X64-AVX512DQ: # %bb.0:
; X64-AVX512DQ-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; X64-AVX512DQ-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; X64-AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; X64-AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; X64-AVX512DQ-NEXT: vzeroupper
; X64-AVX512DQ-NEXT: retq
  %1 = mul <16 x i8> %a0, <i8 2, i8 3, i8 9, i8 17, i8 33, i8 65, i8 129, i8 2, i8 3, i8 9, i8 17, i8 33, i8 65, i8 129, i8 2, i8 3>
  ret <16 x i8> %1
}

;
; PowOf2 - 1 (uniform)
;
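; A multiply by 2^n - 1 becomes a shift and a subtract: x * 7 == (x << 3) - x.
;
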
define <2 x i64> @mul_v2i64_7(<2 x i64> %a0) nounwind {
; SSE-LABEL: mul_v2i64_7:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psllq $3, %xmm1
; SSE-NEXT: psubq %xmm0, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: ret{{[l|q]}}
;
; X64-XOP-LABEL: mul_v2i64_7:
; X64-XOP: # %bb.0:
; X64-XOP-NEXT: vpsllq $3, %xmm0, %xmm1
; X64-XOP-NEXT: vpsubq %xmm0, %xmm1, %xmm0
; X64-XOP-NEXT: retq
;
; X64-AVX2-LABEL: mul_v2i64_7:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vpsllq $3, %xmm0, %xmm1
; X64-AVX2-NEXT: vpsubq %xmm0, %xmm1, %xmm0
; X64-AVX2-NEXT: retq
;
; X64-AVX512DQ-LABEL: mul_v2i64_7:
; X64-AVX512DQ: # %bb.0:
; X64-AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX512DQ-NEXT: retq
  %1 = mul <2 x i64> %a0, <i64 7, i64 7>
  ret <2 x i64> %1
}

define <4 x i32> @mul_v4i32_7(<4 x i32> %a0) nounwind {
; X86-SSE-LABEL: mul_v4i32_7:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE-NEXT: retl
;
; X64-SSE-LABEL: mul_v4i32_7:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE-NEXT: retq
;
; X64-XOP-LABEL: mul_v4i32_7:
; X64-XOP: # %bb.0:
; X64-XOP-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-XOP-NEXT: retq
;
; X64-AVX2-LABEL: mul_v4i32_7:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [7,7,7,7]
; X64-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: retq
;
; X64-AVX512DQ-LABEL: mul_v4i32_7:
; X64-AVX512DQ: # %bb.0:
; X64-AVX512DQ-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; X64-AVX512DQ-NEXT: retq
  %1 = mul <4 x i32> %a0, <i32 7, i32 7, i32 7, i32 7>
  ret <4 x i32> %1
}

define <8 x i16> @mul_v8i16_7(<8 x i16> %a0) nounwind {
; X86-SSE-LABEL: mul_v8i16_7:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE-NEXT: retl
;
; X64-SSE-LABEL: mul_v8i16_7:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: mul_v8i16_7:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX-NEXT: retq
  %1 = mul <8 x i16> %a0, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
  ret <8 x i16> %1
}

define <16 x i8> @mul_v16i8_31(<16 x i8> %a0) nounwind {
; X86-SSE-LABEL: mul_v16i8_31:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: movdqa %xmm0, %xmm1
; X86-SSE-NEXT: psllw $5, %xmm1
; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-SSE-NEXT: psubb %xmm0, %xmm1
; X86-SSE-NEXT: movdqa %xmm1, %xmm0
; X86-SSE-NEXT: retl
;
; X64-SSE-LABEL: mul_v16i8_31:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: movdqa %xmm0, %xmm1
; X64-SSE-NEXT: psllw $5, %xmm1
; X64-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; X64-SSE-NEXT: psubb %xmm0, %xmm1
; X64-SSE-NEXT: movdqa %xmm1, %xmm0
; X64-SSE-NEXT: retq
;
; X64-XOP-LABEL: mul_v16i8_31:
; X64-XOP: # %bb.0:
; X64-XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; X64-XOP-NEXT: vpsubb %xmm0, %xmm1, %xmm0
; X64-XOP-NEXT: retq
;
; X64-AVX2-LABEL: mul_v16i8_31:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vpsllw $5, %xmm0, %xmm1
; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; X64-AVX2-NEXT: vpsubb %xmm0, %xmm1, %xmm0
; X64-AVX2-NEXT: retq
;
; X64-AVX512DQ-LABEL: mul_v16i8_31:
; X64-AVX512DQ: # %bb.0:
; X64-AVX512DQ-NEXT: vpsllw $5, %xmm0, %xmm1
; X64-AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; X64-AVX512DQ-NEXT: vpsubb %xmm0, %xmm1, %xmm0
; X64-AVX512DQ-NEXT: retq
  %1 = mul <16 x i8> %a0, <i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31>
  ret <16 x i8> %1
}

;
; -(PowOf2 - 1) (uniform)
;
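; -(2^n - 1) folds the negation into the subtract by swapping its operands:
; x * -7 == x - (x << 3).
;
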
define <2 x i64> @mul_v2i64_neg7(<2 x i64> %a0) nounwind {
; SSE-LABEL: mul_v2i64_neg7:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psllq $3, %xmm1
; SSE-NEXT: psubq %xmm1, %xmm0
; SSE-NEXT: ret{{[l|q]}}
;
; X64-XOP-LABEL: mul_v2i64_neg7:
; X64-XOP: # %bb.0:
; X64-XOP-NEXT: vpsllq $3, %xmm0, %xmm1
; X64-XOP-NEXT: vpsubq %xmm1, %xmm0, %xmm0
; X64-XOP-NEXT: retq
;
; X64-AVX2-LABEL: mul_v2i64_neg7:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vpsllq $3, %xmm0, %xmm1
; X64-AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: retq
;
; X64-AVX512DQ-LABEL: mul_v2i64_neg7:
; X64-AVX512DQ: # %bb.0:
; X64-AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX512DQ-NEXT: retq
  %1 = mul <2 x i64> %a0, <i64 -7, i64 -7>
  ret <2 x i64> %1
}

define <4 x i32> @mul_v4i32_neg63(<4 x i32> %a0) nounwind {
; X86-SSE-LABEL: mul_v4i32_neg63:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE-NEXT: retl
;
; X64-SSE-LABEL: mul_v4i32_neg63:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE-NEXT: retq
;
; X64-XOP-LABEL: mul_v4i32_neg63:
; X64-XOP: # %bb.0:
; X64-XOP-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-XOP-NEXT: retq
;
; X64-AVX2-LABEL: mul_v4i32_neg63:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294967233,4294967233,4294967233,4294967233]
; X64-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: retq
;
; X64-AVX512DQ-LABEL: mul_v4i32_neg63:
; X64-AVX512DQ: # %bb.0:
; X64-AVX512DQ-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; X64-AVX512DQ-NEXT: retq
  %1 = mul <4 x i32> %a0, <i32 -63, i32 -63, i32 -63, i32 -63>
  ret <4 x i32> %1
}

define <8 x i16> @mul_v8i16_neg31(<8 x i16> %a0) nounwind {
; X86-SSE-LABEL: mul_v8i16_neg31:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE-NEXT: retl
;
; X64-SSE-LABEL: mul_v8i16_neg31:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: mul_v8i16_neg31:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX-NEXT: retq
  %1 = mul <8 x i16> %a0, <i16 -31, i16 -31, i16 -31, i16 -31, i16 -31, i16 -31, i16 -31, i16 -31>
  ret <8 x i16> %1
}

define <16 x i8> @mul_v16i8_neg15(<16 x i8> %a0) nounwind {
; X86-SSE-LABEL: mul_v16i8_neg15:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: movdqa %xmm0, %xmm1
; X86-SSE-NEXT: psllw $4, %xmm1
; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-SSE-NEXT: psubb %xmm1, %xmm0
; X86-SSE-NEXT: retl
;
; X64-SSE-LABEL: mul_v16i8_neg15:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: movdqa %xmm0, %xmm1
; X64-SSE-NEXT: psllw $4, %xmm1
; X64-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; X64-SSE-NEXT: psubb %xmm1, %xmm0
; X64-SSE-NEXT: retq
;
; X64-XOP-LABEL: mul_v16i8_neg15:
; X64-XOP: # %bb.0:
; X64-XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; X64-XOP-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; X64-XOP-NEXT: retq
;
; X64-AVX2-LABEL: mul_v16i8_neg15:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vpsllw $4, %xmm0, %xmm1
; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; X64-AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: retq
;
; X64-AVX512DQ-LABEL: mul_v16i8_neg15:
; X64-AVX512DQ: # %bb.0:
; X64-AVX512DQ-NEXT: vpsllw $4, %xmm0, %xmm1
; X64-AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; X64-AVX512DQ-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; X64-AVX512DQ-NEXT: retq
  %1 = mul <16 x i8> %a0, <i8 -15, i8 -15, i8 -15, i8 -15, i8 -15, i8 -15, i8 -15, i8 -15, i8 -15, i8 -15, i8 -15, i8 -15, i8 -15, i8 -15, i8 -15, i8 -15>
  ret <16 x i8> %1
}

;
; PowOf2 - 1 (non-uniform)
;
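; Mixed 2^n - 1 and mixed-sign constants mostly go through the generic
; 64-bit multiply expansion; special cases such as <0, 1> still fold to a
; blend with zero.
;
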
define <2 x i64> @mul_v2i64_15_63(<2 x i64> %a0) nounwind {
; X86-SSE-LABEL: mul_v2i64_15_63:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [15,0,63,0]
; X86-SSE-NEXT: movdqa %xmm0, %xmm2
; X86-SSE-NEXT: pmuludq %xmm1, %xmm2
; X86-SSE-NEXT: psrlq $32, %xmm0
; X86-SSE-NEXT: pmuludq %xmm1, %xmm0
; X86-SSE-NEXT: psllq $32, %xmm0
; X86-SSE-NEXT: paddq %xmm2, %xmm0
; X86-SSE-NEXT: retl
;
; X64-SSE-LABEL: mul_v2i64_15_63:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = [15,63]
; X64-SSE-NEXT: movdqa %xmm0, %xmm2
; X64-SSE-NEXT: pmuludq %xmm1, %xmm2
; X64-SSE-NEXT: psrlq $32, %xmm0
; X64-SSE-NEXT: pmuludq %xmm1, %xmm0
; X64-SSE-NEXT: psllq $32, %xmm0
; X64-SSE-NEXT: paddq %xmm2, %xmm0
; X64-SSE-NEXT: retq
;
; X64-XOP-LABEL: mul_v2i64_15_63:
; X64-XOP: # %bb.0:
; X64-XOP-NEXT: vmovdqa {{.*#+}} xmm1 = [15,63]
; X64-XOP-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
; X64-XOP-NEXT: vpsrlq $32, %xmm0, %xmm0
; X64-XOP-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; X64-XOP-NEXT: vpsllq $32, %xmm0, %xmm0
; X64-XOP-NEXT: vpaddq %xmm0, %xmm2, %xmm0
; X64-XOP-NEXT: retq
;
; X64-AVX2-LABEL: mul_v2i64_15_63:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,63]
; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
; X64-AVX2-NEXT: vpsrlq $32, %xmm0, %xmm0
; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vpsllq $32, %xmm0, %xmm0
; X64-AVX2-NEXT: vpaddq %xmm0, %xmm2, %xmm0
; X64-AVX2-NEXT: retq
;
; X64-AVX512DQ-LABEL: mul_v2i64_15_63:
; X64-AVX512DQ: # %bb.0:
; X64-AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX512DQ-NEXT: retq
  %1 = mul <2 x i64> %a0, <i64 15, i64 63>
  ret <2 x i64> %1
}

define <2 x i64> @mul_v2i64_neg_15_63(<2 x i64> %a0) nounwind {
; X86-SSE-LABEL: mul_v2i64_neg_15_63:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: pcmpeqd %xmm1, %xmm1
; X86-SSE-NEXT: pmuludq %xmm0, %xmm1
; X86-SSE-NEXT: movdqa %xmm0, %xmm2
; X86-SSE-NEXT: psrlq $32, %xmm2
; X86-SSE-NEXT: movdqa {{.*#+}} xmm3 = [4294967281,4294967295,4294967233,4294967295]
; X86-SSE-NEXT: pmuludq %xmm3, %xmm2
; X86-SSE-NEXT: paddq %xmm1, %xmm2
; X86-SSE-NEXT: psllq $32, %xmm2
; X86-SSE-NEXT: pmuludq %xmm3, %xmm0
; X86-SSE-NEXT: paddq %xmm2, %xmm0
; X86-SSE-NEXT: retl
;
; X64-SSE-LABEL: mul_v2i64_neg_15_63:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = [18446744073709551601,18446744073709551553]
; X64-SSE-NEXT: movdqa %xmm0, %xmm2
; X64-SSE-NEXT: pmuludq %xmm1, %xmm2
; X64-SSE-NEXT: movdqa %xmm0, %xmm3
; X64-SSE-NEXT: psrlq $32, %xmm3
; X64-SSE-NEXT: pmuludq %xmm1, %xmm3
; X64-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE-NEXT: paddq %xmm3, %xmm0
; X64-SSE-NEXT: psllq $32, %xmm0
; X64-SSE-NEXT: paddq %xmm2, %xmm0
; X64-SSE-NEXT: retq
;
; X64-XOP-LABEL: mul_v2i64_neg_15_63:
; X64-XOP: # %bb.0:
; X64-XOP-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709551601,18446744073709551553]
; X64-XOP-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
; X64-XOP-NEXT: vpsrlq $32, %xmm0, %xmm3
; X64-XOP-NEXT: vpmuludq %xmm1, %xmm3, %xmm1
; X64-XOP-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-XOP-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; X64-XOP-NEXT: vpsllq $32, %xmm0, %xmm0
; X64-XOP-NEXT: vpaddq %xmm0, %xmm2, %xmm0
; X64-XOP-NEXT: retq
;
; X64-AVX2-LABEL: mul_v2i64_neg_15_63:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709551601,18446744073709551553]
; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
; X64-AVX2-NEXT: vpsrlq $32, %xmm0, %xmm3
; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm3, %xmm1
; X64-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vpsllq $32, %xmm0, %xmm0
; X64-AVX2-NEXT: vpaddq %xmm0, %xmm2, %xmm0
; X64-AVX2-NEXT: retq
;
; X64-AVX512DQ-LABEL: mul_v2i64_neg_15_63:
; X64-AVX512DQ: # %bb.0:
; X64-AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX512DQ-NEXT: retq
  %1 = mul <2 x i64> %a0, <i64 -15, i64 -63>
  ret <2 x i64> %1
}

define <2 x i64> @mul_v2i64_neg_17_65(<2 x i64> %a0) nounwind {
; X86-SSE-LABEL: mul_v2i64_neg_17_65:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: pcmpeqd %xmm1, %xmm1
; X86-SSE-NEXT: pmuludq %xmm0, %xmm1
; X86-SSE-NEXT: movdqa %xmm0, %xmm2
; X86-SSE-NEXT: psrlq $32, %xmm2
; X86-SSE-NEXT: movdqa {{.*#+}} xmm3 = [4294967279,4294967295,4294967231,4294967295]
; X86-SSE-NEXT: pmuludq %xmm3, %xmm2
; X86-SSE-NEXT: paddq %xmm1, %xmm2
; X86-SSE-NEXT: psllq $32, %xmm2
; X86-SSE-NEXT: pmuludq %xmm3, %xmm0
; X86-SSE-NEXT: paddq %xmm2, %xmm0
; X86-SSE-NEXT: retl
;
; X64-SSE-LABEL: mul_v2i64_neg_17_65:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = [18446744073709551599,18446744073709551551]
; X64-SSE-NEXT: movdqa %xmm0, %xmm2
; X64-SSE-NEXT: pmuludq %xmm1, %xmm2
; X64-SSE-NEXT: movdqa %xmm0, %xmm3
; X64-SSE-NEXT: psrlq $32, %xmm3
; X64-SSE-NEXT: pmuludq %xmm1, %xmm3
; X64-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE-NEXT: paddq %xmm3, %xmm0
; X64-SSE-NEXT: psllq $32, %xmm0
; X64-SSE-NEXT: paddq %xmm2, %xmm0
; X64-SSE-NEXT: retq
;
; X64-XOP-LABEL: mul_v2i64_neg_17_65:
; X64-XOP: # %bb.0:
; X64-XOP-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709551599,18446744073709551551]
; X64-XOP-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
; X64-XOP-NEXT: vpsrlq $32, %xmm0, %xmm3
; X64-XOP-NEXT: vpmuludq %xmm1, %xmm3, %xmm1
; X64-XOP-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-XOP-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; X64-XOP-NEXT: vpsllq $32, %xmm0, %xmm0
; X64-XOP-NEXT: vpaddq %xmm0, %xmm2, %xmm0
; X64-XOP-NEXT: retq
;
; X64-AVX2-LABEL: mul_v2i64_neg_17_65:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709551599,18446744073709551551]
; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
; X64-AVX2-NEXT: vpsrlq $32, %xmm0, %xmm3
; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm3, %xmm1
; X64-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vpsllq $32, %xmm0, %xmm0
; X64-AVX2-NEXT: vpaddq %xmm0, %xmm2, %xmm0
; X64-AVX2-NEXT: retq
;
; X64-AVX512DQ-LABEL: mul_v2i64_neg_17_65:
; X64-AVX512DQ: # %bb.0:
; X64-AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX512DQ-NEXT: retq
  %1 = mul <2 x i64> %a0, <i64 -17, i64 -65>
  ret <2 x i64> %1
}

define <2 x i64> @mul_v2i64_0_1(<2 x i64> %a0) nounwind {
; SSE-LABEL: mul_v2i64_0_1:
; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; SSE-NEXT: ret{{[l|q]}}
;
; X64-AVX-LABEL: mul_v2i64_0_1:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; X64-AVX-NEXT: retq
  %1 = mul <2 x i64> %a0, <i64 0, i64 1>
  ret <2 x i64> %1
}

define <2 x i64> @mul_v2i64_neg_0_1(<2 x i64> %a0) nounwind {
; X86-SSE-LABEL: mul_v2i64_neg_0_1:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: movdqa %xmm0, %xmm1
; X86-SSE-NEXT: psrlq $32, %xmm1
; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,0,4294967295,4294967295]
; X86-SSE-NEXT: pmuludq %xmm2, %xmm1
; X86-SSE-NEXT: movdqa %xmm2, %xmm3
; X86-SSE-NEXT: psrlq $32, %xmm3
; X86-SSE-NEXT: pmuludq %xmm0, %xmm3
; X86-SSE-NEXT: paddq %xmm1, %xmm3
; X86-SSE-NEXT: psllq $32, %xmm3
; X86-SSE-NEXT: pmuludq %xmm2, %xmm0
; X86-SSE-NEXT: paddq %xmm3, %xmm0
; X86-SSE-NEXT: retl
;
; X64-SSE-LABEL: mul_v2i64_neg_0_1:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255]
; X64-SSE-NEXT: movdqa %xmm0, %xmm2
; X64-SSE-NEXT: pmuludq %xmm1, %xmm2
; X64-SSE-NEXT: movdqa %xmm0, %xmm3
; X64-SSE-NEXT: psrlq $32, %xmm3
; X64-SSE-NEXT: pmuludq %xmm1, %xmm3
; X64-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE-NEXT: paddq %xmm3, %xmm0
; X64-SSE-NEXT: psllq $32, %xmm0
; X64-SSE-NEXT: paddq %xmm2, %xmm0
; X64-SSE-NEXT: retq
;
; X64-XOP-LABEL: mul_v2i64_neg_0_1:
; X64-XOP: # %bb.0:
; X64-XOP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255]
; X64-XOP-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
; X64-XOP-NEXT: vpsrlq $32, %xmm0, %xmm3
; X64-XOP-NEXT: vpmuludq %xmm1, %xmm3, %xmm1
; X64-XOP-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-XOP-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; X64-XOP-NEXT: vpsllq $32, %xmm0, %xmm0
; X64-XOP-NEXT: vpaddq %xmm0, %xmm2, %xmm0
; X64-XOP-NEXT: retq
;
; X64-AVX2-LABEL: mul_v2i64_neg_0_1:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255]
; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
; X64-AVX2-NEXT: vpsrlq $32, %xmm0, %xmm3
; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm3, %xmm1
; X64-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vpsllq $32, %xmm0, %xmm0
; X64-AVX2-NEXT: vpaddq %xmm0, %xmm2, %xmm0
; X64-AVX2-NEXT: retq
;
; X64-AVX512DQ-LABEL: mul_v2i64_neg_0_1:
; X64-AVX512DQ: # %bb.0:
; X64-AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX512DQ-NEXT: retq
  %1 = mul <2 x i64> %a0, <i64 0, i64 -1>
  ret <2 x i64> %1
}

define <2 x i64> @mul_v2i64_15_neg_63(<2 x i64> %a0) nounwind {
; X86-SSE-LABEL: mul_v2i64_15_neg_63:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: movdqa %xmm0, %xmm1
; X86-SSE-NEXT: psrlq $32, %xmm1
; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [15,0,4294967233,4294967295]
; X86-SSE-NEXT: pmuludq %xmm2, %xmm1
; X86-SSE-NEXT: movdqa %xmm2, %xmm3
; X86-SSE-NEXT: psrlq $32, %xmm3
; X86-SSE-NEXT: pmuludq %xmm0, %xmm3
; X86-SSE-NEXT: paddq %xmm1, %xmm3
; X86-SSE-NEXT: psllq $32, %xmm3
; X86-SSE-NEXT: pmuludq %xmm2, %xmm0
; X86-SSE-NEXT: paddq %xmm3, %xmm0
; X86-SSE-NEXT: retl
;
; X64-SSE-LABEL: mul_v2i64_15_neg_63:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = [15,18446744073709551553]
; X64-SSE-NEXT: movdqa %xmm0, %xmm2
; X64-SSE-NEXT: pmuludq %xmm1, %xmm2
; X64-SSE-NEXT: movdqa %xmm0, %xmm3
; X64-SSE-NEXT: psrlq $32, %xmm3
; X64-SSE-NEXT: pmuludq %xmm1, %xmm3
; X64-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE-NEXT: paddq %xmm3, %xmm0
; X64-SSE-NEXT: psllq $32, %xmm0
; X64-SSE-NEXT: paddq %xmm2, %xmm0
; X64-SSE-NEXT: retq
;
; X64-XOP-LABEL: mul_v2i64_15_neg_63:
; X64-XOP: # %bb.0:
; X64-XOP-NEXT: vmovdqa {{.*#+}} xmm1 = [15,18446744073709551553]
; X64-XOP-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
; X64-XOP-NEXT: vpsrlq $32, %xmm0, %xmm3
; X64-XOP-NEXT: vpmuludq %xmm1, %xmm3, %xmm1
; X64-XOP-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-XOP-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; X64-XOP-NEXT: vpsllq $32, %xmm0, %xmm0
; X64-XOP-NEXT: vpaddq %xmm0, %xmm2, %xmm0
; X64-XOP-NEXT: retq
;
; X64-AVX2-LABEL: mul_v2i64_15_neg_63:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,18446744073709551553]
; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
; X64-AVX2-NEXT: vpsrlq $32, %xmm0, %xmm3
; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm3, %xmm1
; X64-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vpsllq $32, %xmm0, %xmm0
; X64-AVX2-NEXT: vpaddq %xmm0, %xmm2, %xmm0
; X64-AVX2-NEXT: retq
;
; X64-AVX512DQ-LABEL: mul_v2i64_15_neg_63:
; X64-AVX512DQ: # %bb.0:
; X64-AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX512DQ-NEXT: retq
  %1 = mul <2 x i64> %a0, <i64 15, i64 -63>
  ret <2 x i64> %1
}

define <4 x i32> @mul_v4i32_0_15_31_7(<4 x i32> %a0) nounwind {
; X86-SSE-LABEL: mul_v4i32_0_15_31_7:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE-NEXT: retl
;
; X64-SSE-LABEL: mul_v4i32_0_15_31_7:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: mul_v4i32_0_15_31_7:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX-NEXT: retq
  %1 = mul <4 x i32> %a0, <i32 0, i32 15, i32 31, i32 7>
  ret <4 x i32> %1
}

define <8 x i16> @mul_v8i16_0_1_7_15_31_63_127_255(<8 x i16> %a0) nounwind {
; X86-SSE-LABEL: mul_v8i16_0_1_7_15_31_63_127_255:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE-NEXT: retl
;
; X64-SSE-LABEL: mul_v8i16_0_1_7_15_31_63_127_255:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: mul_v8i16_0_1_7_15_31_63_127_255:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX-NEXT: retq
  %1 = mul <8 x i16> %a0, <i16 0, i16 1, i16 7, i16 15, i16 31, i16 63, i16 127, i16 255>
  ret <8 x i16> %1
}

define <16 x i8> @mul_v16i8_0_1_3_7_15_31_63_127_0_1_3_7_15_31_63_127(<16 x i8> %a0) nounwind {
; SSE-LABEL: mul_v16i8_0_1_3_7_15_31_63_127_0_1_3_7_15_31_63_127:
; SSE: # %bb.0:
; SSE-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,1,3,7,15,31,63,127]
; SSE-NEXT: pmullw %xmm2, %xmm0
; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; SSE-NEXT: pand %xmm3, %xmm0
; SSE-NEXT: pmullw %xmm2, %xmm1
; SSE-NEXT: pand %xmm3, %xmm1
; SSE-NEXT: packuswb %xmm0, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: ret{{[l|q]}}
;
; X64-XOP-LABEL: mul_v16i8_0_1_3_7_15_31_63_127_0_1_3_7_15_31_63_127:
; X64-XOP: # %bb.0:
; X64-XOP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; X64-XOP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,3,7,15,31,63,127]
; X64-XOP-NEXT: vpmullw %xmm2, %xmm1, %xmm1
; X64-XOP-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X64-XOP-NEXT: vpmullw %xmm2, %xmm0, %xmm0
; X64-XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],xmm1[0,2,4,6,8,10,12,14]
; X64-XOP-NEXT: retq
;
; X64-AVX2-LABEL: mul_v16i8_0_1_3_7_15_31_63_127_0_1_3_7_15_31_63_127:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; X64-AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vzeroupper
; X64-AVX2-NEXT: retq
;
; X64-AVX512DQ-LABEL: mul_v16i8_0_1_3_7_15_31_63_127_0_1_3_7_15_31_63_127:
; X64-AVX512DQ: # %bb.0:
; X64-AVX512DQ-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; X64-AVX512DQ-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; X64-AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; X64-AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; X64-AVX512DQ-NEXT: vzeroupper
; X64-AVX512DQ-NEXT: retq
  %1 = mul <16 x i8> %a0, <i8 0, i8 1, i8 3, i8 7, i8 15, i8 31, i8 63, i8 127, i8 0, i8 1, i8 3, i8 7, i8 15, i8 31, i8 63, i8 127>
  ret <16 x i8> %1
}

define <2 x i64> @mul_v2i64_68_132(<2 x i64> %x) nounwind {
; X86-SSE-LABEL: mul_v2i64_68_132:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [68,0,132,0]
; X86-SSE-NEXT: movdqa %xmm0, %xmm2
; X86-SSE-NEXT: pmuludq %xmm1, %xmm2
; X86-SSE-NEXT: psrlq $32, %xmm0
; X86-SSE-NEXT: pmuludq %xmm1, %xmm0
; X86-SSE-NEXT: psllq $32, %xmm0
; X86-SSE-NEXT: paddq %xmm2, %xmm0
; X86-SSE-NEXT: retl
;
; X64-SSE-LABEL: mul_v2i64_68_132:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = [68,132]
; X64-SSE-NEXT: movdqa %xmm0, %xmm2
; X64-SSE-NEXT: pmuludq %xmm1, %xmm2
; X64-SSE-NEXT: psrlq $32, %xmm0
; X64-SSE-NEXT: pmuludq %xmm1, %xmm0
; X64-SSE-NEXT: psllq $32, %xmm0
; X64-SSE-NEXT: paddq %xmm2, %xmm0
; X64-SSE-NEXT: retq
;
; X64-XOP-LABEL: mul_v2i64_68_132:
; X64-XOP: # %bb.0:
; X64-XOP-NEXT: vmovdqa {{.*#+}} xmm1 = [68,132]
; X64-XOP-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
; X64-XOP-NEXT: vpsrlq $32, %xmm0, %xmm0
; X64-XOP-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; X64-XOP-NEXT: vpsllq $32, %xmm0, %xmm0
; X64-XOP-NEXT: vpaddq %xmm0, %xmm2, %xmm0
; X64-XOP-NEXT: retq
;
; X64-AVX2-LABEL: mul_v2i64_68_132:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [68,132]
; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
; X64-AVX2-NEXT: vpsrlq $32, %xmm0, %xmm0
; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vpsllq $32, %xmm0, %xmm0
; X64-AVX2-NEXT: vpaddq %xmm0, %xmm2, %xmm0
; X64-AVX2-NEXT: retq
;
; X64-AVX512DQ-LABEL: mul_v2i64_68_132:
; X64-AVX512DQ: # %bb.0:
; X64-AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX512DQ-NEXT: retq
  %mul = mul <2 x i64> %x, <i64 68, i64 132>
  ret <2 x i64> %mul
}

define <2 x i64> @mul_v2i64_60_120(<2 x i64> %x) nounwind {
; X86-SSE-LABEL: mul_v2i64_60_120:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [60,0,124,0]
; X86-SSE-NEXT: movdqa %xmm0, %xmm2
; X86-SSE-NEXT: pmuludq %xmm1, %xmm2
; X86-SSE-NEXT: psrlq $32, %xmm0
; X86-SSE-NEXT: pmuludq %xmm1, %xmm0
; X86-SSE-NEXT: psllq $32, %xmm0
; X86-SSE-NEXT: paddq %xmm2, %xmm0
; X86-SSE-NEXT: retl
;
; X64-SSE-LABEL: mul_v2i64_60_120:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = [60,124]
; X64-SSE-NEXT: movdqa %xmm0, %xmm2
; X64-SSE-NEXT: pmuludq %xmm1, %xmm2
; X64-SSE-NEXT: psrlq $32, %xmm0
; X64-SSE-NEXT: pmuludq %xmm1, %xmm0
; X64-SSE-NEXT: psllq $32, %xmm0
; X64-SSE-NEXT: paddq %xmm2, %xmm0
; X64-SSE-NEXT: retq
;
; X64-XOP-LABEL: mul_v2i64_60_120:
; X64-XOP: # %bb.0:
; X64-XOP-NEXT: vmovdqa {{.*#+}} xmm1 = [60,124]
; X64-XOP-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
; X64-XOP-NEXT: vpsrlq $32, %xmm0, %xmm0
; X64-XOP-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; X64-XOP-NEXT: vpsllq $32, %xmm0, %xmm0
; X64-XOP-NEXT: vpaddq %xmm0, %xmm2, %xmm0
; X64-XOP-NEXT: retq
;
; X64-AVX2-LABEL: mul_v2i64_60_120:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [60,124]
; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
; X64-AVX2-NEXT: vpsrlq $32, %xmm0, %xmm0
; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vpsllq $32, %xmm0, %xmm0
; X64-AVX2-NEXT: vpaddq %xmm0, %xmm2, %xmm0
; X64-AVX2-NEXT: retq
;
; X64-AVX512DQ-LABEL: mul_v2i64_60_120:
; X64-AVX512DQ: # %bb.0:
; X64-AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX512DQ-NEXT: retq
  %mul = mul <2 x i64> %x, <i64 60, i64 124>
  ret <2 x i64> %mul
}

; We unfortunately can't see the zext that lives in the other basic block, so
; we don't know that we only need one pmuludq to compute the full 64 bits.
; This sort of issue is more likely to occur when there is a loop and one of
; the multiply inputs is loop invariant.
; FIXME: We should be able to insert an AssertZExt for this.
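; (If both operands were visibly zero-extended from i32, their high 32 bits
; would be known zero and a single pmuludq of the low halves would already
; yield the full 64-bit product.)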
define <2 x i64> @mul_v2i64_zext_cross_bb(<2 x i32>* %in, <2 x i32>* %y) {
; X86-SSE-LABEL: mul_v2i64_zext_cross_bb:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT: pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
; X86-SSE-NEXT: pmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
; X86-SSE-NEXT: pmuludq %xmm1, %xmm0
; X86-SSE-NEXT: retl
;
; X64-SSE-LABEL: mul_v2i64_zext_cross_bb:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
; X64-SSE-NEXT: pmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
; X64-SSE-NEXT: pmuludq %xmm1, %xmm0
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: mul_v2i64_zext_cross_bb:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
; X64-AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
; X64-AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT: retq
  %a = load <2 x i32>, <2 x i32>* %in
  %b = zext <2 x i32> %a to <2 x i64>
  br label %foo

foo:
  %c = load <2 x i32>, <2 x i32>* %y
  %d = zext <2 x i32> %c to <2 x i64>
  %e = mul <2 x i64> %b, %d
  ret <2 x i64> %e
}

define <4 x i64> @mul_v4i64_zext_cross_bb(<4 x i32>* %in, <4 x i32>* %y) {
; X86-SSE-LABEL: mul_v4i64_zext_cross_bb:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT: pmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
; X86-SSE-NEXT: pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
; X86-SSE-NEXT: pmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero
; X86-SSE-NEXT: pmuludq %xmm2, %xmm1
; X86-SSE-NEXT: pmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero
; X86-SSE-NEXT: pmuludq %xmm2, %xmm0
; X86-SSE-NEXT: retl
;
; X64-SSE-LABEL: mul_v4i64_zext_cross_bb:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: pmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
; X64-SSE-NEXT: pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
; X64-SSE-NEXT: pmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero
; X64-SSE-NEXT: pmuludq %xmm2, %xmm1
; X64-SSE-NEXT: pmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero
; X64-SSE-NEXT: pmuludq %xmm2, %xmm0
; X64-SSE-NEXT: retq
;
; X64-XOP-LABEL: mul_v4i64_zext_cross_bb:
; X64-XOP: # %bb.0:
; X64-XOP-NEXT: vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
; X64-XOP-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
; X64-XOP-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X64-XOP-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
; X64-XOP-NEXT: vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero
; X64-XOP-NEXT: vextractf128 $1, %ymm0, %xmm3
; X64-XOP-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
; X64-XOP-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; X64-XOP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; X64-XOP-NEXT: retq
;
; X64-AVX2-LABEL: mul_v4i64_zext_cross_bb:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X64-AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X64-AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: retq
;
; X64-AVX512DQ-LABEL: mul_v4i64_zext_cross_bb:
; X64-AVX512DQ: # %bb.0:
; X64-AVX512DQ-NEXT: vpmovzxdq {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X64-AVX512DQ-NEXT: vpmovzxdq {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X64-AVX512DQ-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; X64-AVX512DQ-NEXT: retq
  %a = load <4 x i32>, <4 x i32>* %in
  %b = zext <4 x i32> %a to <4 x i64>
  br label %foo

foo:
  %c = load <4 x i32>, <4 x i32>* %y
  %d = zext <4 x i32> %c to <4 x i64>
  %e = mul <4 x i64> %b, %d
  ret <4 x i64> %e
}