; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2,X86-SSE,X86-SSE2
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,SSE4,X86-SSE,X86-SSE4
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2,X64-SSE,X64-SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2,-slow-pmulld | FileCheck %s --check-prefixes=SSE,SSE4,X64-SSE,X64-SSE4,X64-SSE4-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2,+slow-pmulld | FileCheck %s --check-prefixes=SSE,SSE4,X64-SSE,X64-SSE4,X64-SSE4-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop | FileCheck %s --check-prefixes=X64-AVX,X64-XOP
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X64-AVX,X64-AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefixes=X64-AVX,X64-AVX512DQ
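;
; PowOf2 (uniform)
;
; Multiplying every lane by the same power of two lowers to one immediate
; shift, e.g. x * 8 == x << 3 (psllq/pslld/psllw). v16i8 has no byte shift,
; so a word shift is used and the bits carried across byte lanes are masked
; off (psllw $5 + pand), as the checks below show.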
define <2 x i64> @mul_v2i64_8(<2 x i64> %a0) nounwind {
; SSE-LABEL: mul_v2i64_8:
; SSE: # %bb.0:
; SSE-NEXT: psllq $3, %xmm0
; SSE-NEXT: ret{{[l|q]}}
;
; X64-AVX-LABEL: mul_v2i64_8:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vpsllq $3, %xmm0, %xmm0
; X64-AVX-NEXT: retq
  %1 = mul <2 x i64> %a0, <i64 8, i64 8>
  ret <2 x i64> %1
}
define <4 x i32> @mul_v4i32_8(<4 x i32> %a0) nounwind {
; SSE-LABEL: mul_v4i32_8:
; SSE: # %bb.0:
; SSE-NEXT: pslld $3, %xmm0
; SSE-NEXT: ret{{[l|q]}}
;
; X64-AVX-LABEL: mul_v4i32_8:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vpslld $3, %xmm0, %xmm0
; X64-AVX-NEXT: retq
  %1 = mul <4 x i32> %a0, <i32 8, i32 8, i32 8, i32 8>
  ret <4 x i32> %1
}
define <8 x i16> @mul_v8i16_8(<8 x i16> %a0) nounwind {
; SSE-LABEL: mul_v8i16_8:
; SSE: # %bb.0:
; SSE-NEXT: psllw $3, %xmm0
; SSE-NEXT: ret{{[l|q]}}
;
; X64-AVX-LABEL: mul_v8i16_8:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vpsllw $3, %xmm0, %xmm0
; X64-AVX-NEXT: retq
  %1 = mul <8 x i16> %a0, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  ret <8 x i16> %1
}
define <16 x i8> @mul_v16i8_32(<16 x i8> %a0) nounwind {
; X86-SSE-LABEL: mul_v16i8_32:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: psllw $5, %xmm0
; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE-NEXT: retl
;
; X64-SSE-LABEL: mul_v16i8_32:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: psllw $5, %xmm0
; X64-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE-NEXT: retq
;
; X64-XOP-LABEL: mul_v16i8_32:
; X64-XOP: # %bb.0:
; X64-XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-XOP-NEXT: retq
;
; X64-AVX2-LABEL: mul_v16i8_32:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vpsllw $5, %xmm0, %xmm0
; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX2-NEXT: retq
;
; X64-AVX512DQ-LABEL: mul_v16i8_32:
; X64-AVX512DQ: # %bb.0:
; X64-AVX512DQ-NEXT: vpsllw $5, %xmm0, %xmm0
; X64-AVX512DQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; X64-AVX512DQ-NEXT: retq
  %1 = mul <16 x i8> %a0, <i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32>
  ret <16 x i8> %1
}
;
; PowOf2 (non-uniform)
;
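; Per-lane powers of two become per-lane variable shifts where the target has
; them (XOP vpshlq, AVX2/AVX512 vpsllvq): x * <32, 8> == x << <5, 3>. Plain
; SSE has no per-lane shift, so it shifts twice and blends the two results.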
define <2 x i64> @mul_v2i64_32_8(<2 x i64> %a0) nounwind {
; SSE2-LABEL: mul_v2i64_32_8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psllq $5, %xmm1
; SSE2-NEXT: psllq $3, %xmm0
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT: ret{{[l|q]}}
;
; SSE4-LABEL: mul_v2i64_32_8:
; SSE4: # %bb.0:
; SSE4-NEXT: movdqa %xmm0, %xmm1
; SSE4-NEXT: psllq $3, %xmm1
; SSE4-NEXT: psllq $5, %xmm0
; SSE4-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE4-NEXT: ret{{[l|q]}}
;
; X64-XOP-LABEL: mul_v2i64_32_8:
; X64-XOP: # %bb.0:
; X64-XOP-NEXT: vpshlq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-XOP-NEXT: retq
;
; X64-AVX2-LABEL: mul_v2i64_32_8:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX2-NEXT: retq
;
; X64-AVX512DQ-LABEL: mul_v2i64_32_8:
; X64-AVX512DQ: # %bb.0:
; X64-AVX512DQ-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX512DQ-NEXT: retq
  %1 = mul <2 x i64> %a0, <i64 32, i64 8>
  ret <2 x i64> %1
}
define <4 x i32> @mul_v4i32_1_2_4_8(<4 x i32> %a0) nounwind {
; X86-SSE2-LABEL: mul_v4i32_1_2_4_8:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-SSE2-NEXT: retl
;
; X86-SSE4-LABEL: mul_v4i32_1_2_4_8:
; X86-SSE4: # %bb.0:
; X86-SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE4-NEXT: retl
;
; X64-SSE2-LABEL: mul_v4i32_1_2_4_8:
; X64-SSE2: # %bb.0:
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-SSE2-NEXT: retq
;
; X64-SSE4-LABEL: mul_v4i32_1_2_4_8:
; X64-SSE4: # %bb.0:
; X64-SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE4-NEXT: retq
;
; X64-XOP-LABEL: mul_v4i32_1_2_4_8:
; X64-XOP: # %bb.0:
; X64-XOP-NEXT: vpshld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-XOP-NEXT: retq
;
; X64-AVX2-LABEL: mul_v4i32_1_2_4_8:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX2-NEXT: retq
;
; X64-AVX512DQ-LABEL: mul_v4i32_1_2_4_8:
; X64-AVX512DQ: # %bb.0:
; X64-AVX512DQ-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX512DQ-NEXT: retq
  %1 = mul <4 x i32> %a0, <i32 1, i32 2, i32 4, i32 8>
  ret <4 x i32> %1
}
define <4 x i32> @mul_v4i32_1_2_4_8_optsize(<4 x i32> %a0) nounwind optsize {
; SSE2-LABEL: mul_v4i32_1_2_4_8_optsize:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: ret{{[l|q]}}
;
; X86-SSE4-LABEL: mul_v4i32_1_2_4_8_optsize:
; X86-SSE4: # %bb.0:
; X86-SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE4-NEXT: retl
;
; X64-SSE4-LABEL: mul_v4i32_1_2_4_8_optsize:
; X64-SSE4: # %bb.0:
; X64-SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE4-NEXT: retq
;
; X64-XOP-LABEL: mul_v4i32_1_2_4_8_optsize:
; X64-XOP: # %bb.0:
; X64-XOP-NEXT: vpshld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-XOP-NEXT: retq
;
; X64-AVX2-LABEL: mul_v4i32_1_2_4_8_optsize:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX2-NEXT: retq
;
; X64-AVX512DQ-LABEL: mul_v4i32_1_2_4_8_optsize:
; X64-AVX512DQ: # %bb.0:
; X64-AVX512DQ-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX512DQ-NEXT: retq
  %1 = mul <4 x i32> %a0, <i32 1, i32 2, i32 4, i32 8>
  ret <4 x i32> %1
}
define <8 x i16> @mul_v8i16_1_2_4_8_16_32_64_128(<8 x i16> %a0) nounwind {
; X86-SSE-LABEL: mul_v8i16_1_2_4_8_16_32_64_128:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE-NEXT: retl
;
; X64-SSE-LABEL: mul_v8i16_1_2_4_8_16_32_64_128:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE-NEXT: retq
;
; X64-XOP-LABEL: mul_v8i16_1_2_4_8_16_32_64_128:
; X64-XOP: # %bb.0:
; X64-XOP-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-XOP-NEXT: retq
;
; X64-AVX2-LABEL: mul_v8i16_1_2_4_8_16_32_64_128:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX2-NEXT: retq
;
; X64-AVX512DQ-LABEL: mul_v8i16_1_2_4_8_16_32_64_128:
; X64-AVX512DQ: # %bb.0:
; X64-AVX512DQ-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX512DQ-NEXT: retq
  %1 = mul <8 x i16> %a0, <i16 1, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128>
  ret <8 x i16> %1
}
define <16 x i8> @mul_v16i8_1_2_4_8_1_2_4_8_1_2_4_8_1_2_4_8(<16 x i8> %a0) nounwind {
; SSE2-LABEL: mul_v16i8_1_2_4_8_1_2_4_8_1_2_4_8_1_2_4_8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8,1,2,4,8]
; SSE2-NEXT: pmullw %xmm2, %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm3, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pmullw %xmm2, %xmm0
; SSE2-NEXT: pand %xmm3, %xmm0
; SSE2-NEXT: packuswb %xmm1, %xmm0
; SSE2-NEXT: ret{{[l|q]}}
;
; SSE4-LABEL: mul_v16i8_1_2_4_8_1_2_4_8_1_2_4_8_1_2_4_8:
; SSE4: # %bb.0:
; SSE4-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE4-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE4-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8,1,2,4,8]
; SSE4-NEXT: pmullw %xmm2, %xmm0
; SSE4-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; SSE4-NEXT: pand %xmm3, %xmm0
; SSE4-NEXT: pmullw %xmm2, %xmm1
; SSE4-NEXT: pand %xmm3, %xmm1
; SSE4-NEXT: packuswb %xmm0, %xmm1
; SSE4-NEXT: movdqa %xmm1, %xmm0
; SSE4-NEXT: ret{{[l|q]}}
;
; X64-XOP-LABEL: mul_v16i8_1_2_4_8_1_2_4_8_1_2_4_8_1_2_4_8:
; X64-XOP: # %bb.0:
; X64-XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-XOP-NEXT: retq
;
; X64-AVX2-LABEL: mul_v16i8_1_2_4_8_1_2_4_8_1_2_4_8_1_2_4_8:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; X64-AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vzeroupper
; X64-AVX2-NEXT: retq
;
; X64-AVX512DQ-LABEL: mul_v16i8_1_2_4_8_1_2_4_8_1_2_4_8_1_2_4_8:
; X64-AVX512DQ: # %bb.0:
; X64-AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; X64-AVX512DQ-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; X64-AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; X64-AVX512DQ-NEXT: vzeroupper
; X64-AVX512DQ-NEXT: retq
  %1 = mul <16 x i8> %a0, <i8 1, i8 2, i8 4, i8 8, i8 1, i8 2, i8 4, i8 8, i8 1, i8 2, i8 4, i8 8, i8 1, i8 2, i8 4, i8 8>
  ret <16 x i8> %1
}
;
; PowOf2 + 1 (uniform)
;
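; 2^N + 1 factors to a shift plus an add: x * 17 == (x << 4) + x. For the
; i32 cases AVX2 instead broadcasts the constant and uses vpmulld.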
define <2 x i64> @mul_v2i64_17(<2 x i64> %a0) nounwind {
; SSE-LABEL: mul_v2i64_17:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psllq $4, %xmm1
; SSE-NEXT: paddq %xmm1, %xmm0
; SSE-NEXT: ret{{[l|q]}}
;
; X64-AVX-LABEL: mul_v2i64_17:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vpsllq $4, %xmm0, %xmm1
; X64-AVX-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; X64-AVX-NEXT: retq
  %1 = mul <2 x i64> %a0, <i64 17, i64 17>
  ret <2 x i64> %1
}
define <4 x i32> @mul_v4i32_17(<4 x i32> %a0) nounwind {
; SSE-LABEL: mul_v4i32_17:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: pslld $4, %xmm1
; SSE-NEXT: paddd %xmm1, %xmm0
; SSE-NEXT: ret{{[l|q]}}
;
; X64-XOP-LABEL: mul_v4i32_17:
; X64-XOP: # %bb.0:
; X64-XOP-NEXT: vpslld $4, %xmm0, %xmm1
; X64-XOP-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; X64-XOP-NEXT: retq
;
; X64-AVX2-LABEL: mul_v4i32_17:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [17,17,17,17]
; X64-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: retq
;
; X64-AVX512DQ-LABEL: mul_v4i32_17:
; X64-AVX512DQ: # %bb.0:
; X64-AVX512DQ-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; X64-AVX512DQ-NEXT: retq
  %1 = mul <4 x i32> %a0, <i32 17, i32 17, i32 17, i32 17>
  ret <4 x i32> %1
}
define <8 x i16> @mul_v8i16_17(<8 x i16> %a0) nounwind {
; SSE-LABEL: mul_v8i16_17:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psllw $4, %xmm1
; SSE-NEXT: paddw %xmm1, %xmm0
; SSE-NEXT: ret{{[l|q]}}
;
; X64-AVX-LABEL: mul_v8i16_17:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vpsllw $4, %xmm0, %xmm1
; X64-AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT: retq
  %1 = mul <8 x i16> %a0, <i16 17, i16 17, i16 17, i16 17, i16 17, i16 17, i16 17, i16 17>
  ret <8 x i16> %1
}
define <16 x i8> @mul_v16i8_17(<16 x i8> %a0) nounwind {
; X86-SSE-LABEL: mul_v16i8_17:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: movdqa %xmm0, %xmm1
; X86-SSE-NEXT: psllw $4, %xmm1
; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-SSE-NEXT: paddb %xmm1, %xmm0
; X86-SSE-NEXT: retl
;
; X64-SSE-LABEL: mul_v16i8_17:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: movdqa %xmm0, %xmm1
; X64-SSE-NEXT: psllw $4, %xmm1
; X64-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; X64-SSE-NEXT: paddb %xmm1, %xmm0
; X64-SSE-NEXT: retq
;
; X64-XOP-LABEL: mul_v16i8_17:
; X64-XOP: # %bb.0:
; X64-XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; X64-XOP-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; X64-XOP-NEXT: retq
;
; X64-AVX2-LABEL: mul_v16i8_17:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vpsllw $4, %xmm0, %xmm1
; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; X64-AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; X64-AVX2-NEXT: retq
;
; X64-AVX512DQ-LABEL: mul_v16i8_17:
; X64-AVX512DQ: # %bb.0:
; X64-AVX512DQ-NEXT: vpsllw $4, %xmm0, %xmm1
; X64-AVX512DQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
; X64-AVX512DQ-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; X64-AVX512DQ-NEXT: retq
  %1 = mul <16 x i8> %a0, <i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17>
  ret <16 x i8> %1
}
define <4 x i64> @mul_v4i64_17(<4 x i64> %a0) nounwind {
; SSE-LABEL: mul_v4i64_17:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: psllq $4, %xmm2
; SSE-NEXT: paddq %xmm2, %xmm0
; SSE-NEXT: movdqa %xmm1, %xmm2
; SSE-NEXT: psllq $4, %xmm2
; SSE-NEXT: paddq %xmm2, %xmm1
; SSE-NEXT: ret{{[l|q]}}
;
; X64-XOP-LABEL: mul_v4i64_17:
; X64-XOP: # %bb.0:
; X64-XOP-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-XOP-NEXT: vpsllq $4, %xmm1, %xmm2
; X64-XOP-NEXT: vpaddq %xmm1, %xmm2, %xmm1
; X64-XOP-NEXT: vpsllq $4, %xmm0, %xmm2
; X64-XOP-NEXT: vpaddq %xmm0, %xmm2, %xmm0
; X64-XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X64-XOP-NEXT: retq
;
; X64-AVX2-LABEL: mul_v4i64_17:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vpsllq $4, %ymm0, %ymm1
; X64-AVX2-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; X64-AVX2-NEXT: retq
;
; X64-AVX512DQ-LABEL: mul_v4i64_17:
; X64-AVX512DQ: # %bb.0:
; X64-AVX512DQ-NEXT: vpsllq $4, %ymm0, %ymm1
; X64-AVX512DQ-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; X64-AVX512DQ-NEXT: retq
  %1 = mul <4 x i64> %a0, <i64 17, i64 17, i64 17, i64 17>
  ret <4 x i64> %1
}
define <8 x i32> @mul_v8i32_17(<8 x i32> %a0) nounwind {
; SSE-LABEL: mul_v8i32_17:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: pslld $4, %xmm2
; SSE-NEXT: paddd %xmm2, %xmm0
; SSE-NEXT: movdqa %xmm1, %xmm2
; SSE-NEXT: pslld $4, %xmm2
; SSE-NEXT: paddd %xmm2, %xmm1
; SSE-NEXT: ret{{[l|q]}}
;
; X64-XOP-LABEL: mul_v8i32_17:
; X64-XOP: # %bb.0:
; X64-XOP-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-XOP-NEXT: vpslld $4, %xmm1, %xmm2
; X64-XOP-NEXT: vpaddd %xmm1, %xmm2, %xmm1
; X64-XOP-NEXT: vpslld $4, %xmm0, %xmm2
; X64-XOP-NEXT: vpaddd %xmm0, %xmm2, %xmm0
; X64-XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X64-XOP-NEXT: retq
;
; X64-AVX2-LABEL: mul_v8i32_17:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [17,17,17,17,17,17,17,17]
; X64-AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: retq
;
; X64-AVX512DQ-LABEL: mul_v8i32_17:
; X64-AVX512DQ: # %bb.0:
; X64-AVX512DQ-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
; X64-AVX512DQ-NEXT: retq
  %1 = mul <8 x i32> %a0, <i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17>
  ret <8 x i32> %1
}
define <16 x i16> @mul_v16i16_17(<16 x i16> %a0) nounwind {
; SSE-LABEL: mul_v16i16_17:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: psllw $4, %xmm2
; SSE-NEXT: paddw %xmm2, %xmm0
; SSE-NEXT: movdqa %xmm1, %xmm2
; SSE-NEXT: psllw $4, %xmm2
; SSE-NEXT: paddw %xmm2, %xmm1
; SSE-NEXT: ret{{[l|q]}}
;
; X64-XOP-LABEL: mul_v16i16_17:
; X64-XOP: # %bb.0:
; X64-XOP-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-XOP-NEXT: vpsllw $4, %xmm1, %xmm2
; X64-XOP-NEXT: vpaddw %xmm1, %xmm2, %xmm1
; X64-XOP-NEXT: vpsllw $4, %xmm0, %xmm2
; X64-XOP-NEXT: vpaddw %xmm0, %xmm2, %xmm0
; X64-XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X64-XOP-NEXT: retq
;
; X64-AVX2-LABEL: mul_v16i16_17:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vpsllw $4, %ymm0, %ymm1
; X64-AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: retq
;
; X64-AVX512DQ-LABEL: mul_v16i16_17:
; X64-AVX512DQ: # %bb.0:
; X64-AVX512DQ-NEXT: vpsllw $4, %ymm0, %ymm1
; X64-AVX512DQ-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; X64-AVX512DQ-NEXT: retq
  %1 = mul <16 x i16> %a0, <i16 17, i16 17, i16 17, i16 17, i16 17, i16 17, i16 17, i16 17, i16 17, i16 17, i16 17, i16 17, i16 17, i16 17, i16 17, i16 17>
  ret <16 x i16> %1
}
define <32 x i8> @mul_v32i8_17(<32 x i8> %a0) nounwind {
; SSE-LABEL: mul_v32i8_17:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: psllw $4, %xmm2
; SSE-NEXT: movdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; SSE-NEXT: pand %xmm3, %xmm2
; SSE-NEXT: paddb %xmm2, %xmm0
; SSE-NEXT: movdqa %xmm1, %xmm2
; SSE-NEXT: psllw $4, %xmm2
; SSE-NEXT: pand %xmm3, %xmm2
; SSE-NEXT: paddb %xmm2, %xmm1
; SSE-NEXT: ret{{[l|q]}}
;
; X64-XOP-LABEL: mul_v32i8_17:
; X64-XOP: # %bb.0:
; X64-XOP-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-XOP-NEXT: vbroadcastss {{.*#+}} xmm2 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
; X64-XOP-NEXT: vpshlb %xmm2, %xmm1, %xmm3
; X64-XOP-NEXT: vpaddb %xmm1, %xmm3, %xmm1
; X64-XOP-NEXT: vpshlb %xmm2, %xmm0, %xmm2
; X64-XOP-NEXT: vpaddb %xmm0, %xmm2, %xmm0
; X64-XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X64-XOP-NEXT: retq
;
; X64-AVX2-LABEL: mul_v32i8_17:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vpsllw $4, %ymm0, %ymm1
; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; X64-AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; X64-AVX2-NEXT: retq
;
; X64-AVX512DQ-LABEL: mul_v32i8_17:
; X64-AVX512DQ: # %bb.0:
; X64-AVX512DQ-NEXT: vpsllw $4, %ymm0, %ymm1
; X64-AVX512DQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1
; X64-AVX512DQ-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; X64-AVX512DQ-NEXT: retq
  %1 = mul <32 x i8> %a0, <i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17>
  ret <32 x i8> %1
}
;
; -(PowOf2 + 1) (uniform)
;
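; -(2^N + 1) is the same shift+add followed by a negate: the sum is
; subtracted from zero (pxor + psub).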
define <2 x i64> @mul_v2i64_neg1025(<2 x i64> %a0) nounwind {
; SSE-LABEL: mul_v2i64_neg1025:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psllq $10, %xmm1
; SSE-NEXT: paddq %xmm0, %xmm1
; SSE-NEXT: pxor %xmm0, %xmm0
; SSE-NEXT: psubq %xmm1, %xmm0
; SSE-NEXT: ret{{[l|q]}}
;
; X64-AVX-LABEL: mul_v2i64_neg1025:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vpsllq $10, %xmm0, %xmm1
; X64-AVX-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; X64-AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X64-AVX-NEXT: vpsubq %xmm0, %xmm1, %xmm0
; X64-AVX-NEXT: retq
  %1 = mul <2 x i64> %a0, <i64 -1025, i64 -1025>
  ret <2 x i64> %1
}
define <4 x i32> @mul_v4i32_neg33(<4 x i32> %a0) nounwind {
; SSE-LABEL: mul_v4i32_neg33:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: pslld $5, %xmm1
; SSE-NEXT: paddd %xmm0, %xmm1
; SSE-NEXT: pxor %xmm0, %xmm0
; SSE-NEXT: psubd %xmm1, %xmm0
; SSE-NEXT: ret{{[l|q]}}
;
; X64-XOP-LABEL: mul_v4i32_neg33:
; X64-XOP: # %bb.0:
; X64-XOP-NEXT: vpslld $5, %xmm0, %xmm1
; X64-XOP-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; X64-XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X64-XOP-NEXT: vpsubd %xmm0, %xmm1, %xmm0
; X64-XOP-NEXT: retq
;
; X64-AVX2-LABEL: mul_v4i32_neg33:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294967263,4294967263,4294967263,4294967263]
; X64-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: retq
;
; X64-AVX512DQ-LABEL: mul_v4i32_neg33:
; X64-AVX512DQ: # %bb.0:
; X64-AVX512DQ-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; X64-AVX512DQ-NEXT: retq
  %1 = mul <4 x i32> %a0, <i32 -33, i32 -33, i32 -33, i32 -33>
  ret <4 x i32> %1
}
define <8 x i16> @mul_v8i16_neg9(<8 x i16> %a0) nounwind {
; SSE-LABEL: mul_v8i16_neg9:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psllw $3, %xmm1
; SSE-NEXT: paddw %xmm0, %xmm1
; SSE-NEXT: pxor %xmm0, %xmm0
; SSE-NEXT: psubw %xmm1, %xmm0
; SSE-NEXT: ret{{[l|q]}}
;
; X64-AVX-LABEL: mul_v8i16_neg9:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vpsllw $3, %xmm0, %xmm1
; X64-AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X64-AVX-NEXT: vpsubw %xmm0, %xmm1, %xmm0
; X64-AVX-NEXT: retq
  %1 = mul <8 x i16> %a0, <i16 -9, i16 -9, i16 -9, i16 -9, i16 -9, i16 -9, i16 -9, i16 -9>
  ret <8 x i16> %1
}
define <16 x i8> @mul_v16i8_neg5(<16 x i8> %a0) nounwind {
; X86-SSE-LABEL: mul_v16i8_neg5:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: movdqa %xmm0, %xmm1
; X86-SSE-NEXT: psllw $2, %xmm1
; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-SSE-NEXT: paddb %xmm0, %xmm1
; X86-SSE-NEXT: pxor %xmm0, %xmm0
; X86-SSE-NEXT: psubb %xmm1, %xmm0
; X86-SSE-NEXT: retl
;
; X64-SSE-LABEL: mul_v16i8_neg5:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: movdqa %xmm0, %xmm1
; X64-SSE-NEXT: psllw $2, %xmm1
; X64-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; X64-SSE-NEXT: paddb %xmm0, %xmm1
; X64-SSE-NEXT: pxor %xmm0, %xmm0
; X64-SSE-NEXT: psubb %xmm1, %xmm0
; X64-SSE-NEXT: retq
;
; X64-XOP-LABEL: mul_v16i8_neg5:
; X64-XOP: # %bb.0:
; X64-XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; X64-XOP-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; X64-XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X64-XOP-NEXT: vpsubb %xmm0, %xmm1, %xmm0
; X64-XOP-NEXT: retq
;
; X64-AVX2-LABEL: mul_v16i8_neg5:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vpsllw $2, %xmm0, %xmm1
; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; X64-AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; X64-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X64-AVX2-NEXT: vpsubb %xmm0, %xmm1, %xmm0
; X64-AVX2-NEXT: retq
;
; X64-AVX512DQ-LABEL: mul_v16i8_neg5:
; X64-AVX512DQ: # %bb.0:
; X64-AVX512DQ-NEXT: vpsllw $2, %xmm0, %xmm1
; X64-AVX512DQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
; X64-AVX512DQ-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; X64-AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X64-AVX512DQ-NEXT: vpsubb %xmm0, %xmm1, %xmm0
; X64-AVX512DQ-NEXT: retq
  %1 = mul <16 x i8> %a0, <i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5>
  ret <16 x i8> %1
}
define <4 x i64> @mul_v4i64_neg1025(<4 x i64> %a0) nounwind {
; SSE-LABEL: mul_v4i64_neg1025:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm3
; SSE-NEXT: psllq $10, %xmm3
; SSE-NEXT: paddq %xmm0, %xmm3
; SSE-NEXT: pxor %xmm2, %xmm2
; SSE-NEXT: pxor %xmm0, %xmm0
; SSE-NEXT: psubq %xmm3, %xmm0
; SSE-NEXT: movdqa %xmm1, %xmm3
; SSE-NEXT: psllq $10, %xmm3
; SSE-NEXT: paddq %xmm1, %xmm3
; SSE-NEXT: psubq %xmm3, %xmm2
; SSE-NEXT: movdqa %xmm2, %xmm1
; SSE-NEXT: ret{{[l|q]}}
;
; X64-XOP-LABEL: mul_v4i64_neg1025:
; X64-XOP: # %bb.0:
; X64-XOP-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-XOP-NEXT: vpsllq $10, %xmm1, %xmm2
; X64-XOP-NEXT: vpaddq %xmm1, %xmm2, %xmm1
; X64-XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
; X64-XOP-NEXT: vpsubq %xmm1, %xmm2, %xmm1
; X64-XOP-NEXT: vpsllq $10, %xmm0, %xmm3
; X64-XOP-NEXT: vpaddq %xmm0, %xmm3, %xmm0
; X64-XOP-NEXT: vpsubq %xmm0, %xmm2, %xmm0
; X64-XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X64-XOP-NEXT: retq
;
; X64-AVX2-LABEL: mul_v4i64_neg1025:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vpsllq $10, %ymm0, %ymm1
; X64-AVX2-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; X64-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X64-AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm0
; X64-AVX2-NEXT: retq
;
; X64-AVX512DQ-LABEL: mul_v4i64_neg1025:
; X64-AVX512DQ: # %bb.0:
; X64-AVX512DQ-NEXT: vpsllq $10, %ymm0, %ymm1
; X64-AVX512DQ-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; X64-AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X64-AVX512DQ-NEXT: vpsubq %ymm0, %ymm1, %ymm0
; X64-AVX512DQ-NEXT: retq
  %1 = mul <4 x i64> %a0, <i64 -1025, i64 -1025, i64 -1025, i64 -1025>
  ret <4 x i64> %1
}
define <8 x i32> @mul_v8i32_neg33(<8 x i32> %a0) nounwind {
; SSE-LABEL: mul_v8i32_neg33:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm3
; SSE-NEXT: pslld $5, %xmm3
; SSE-NEXT: paddd %xmm0, %xmm3
; SSE-NEXT: pxor %xmm2, %xmm2
; SSE-NEXT: pxor %xmm0, %xmm0
; SSE-NEXT: psubd %xmm3, %xmm0
; SSE-NEXT: movdqa %xmm1, %xmm3
; SSE-NEXT: pslld $5, %xmm3
; SSE-NEXT: paddd %xmm1, %xmm3
; SSE-NEXT: psubd %xmm3, %xmm2
; SSE-NEXT: movdqa %xmm2, %xmm1
; SSE-NEXT: ret{{[l|q]}}
;
; X64-XOP-LABEL: mul_v8i32_neg33:
; X64-XOP: # %bb.0:
; X64-XOP-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-XOP-NEXT: vpslld $5, %xmm1, %xmm2
; X64-XOP-NEXT: vpaddd %xmm1, %xmm2, %xmm1
; X64-XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
; X64-XOP-NEXT: vpsubd %xmm1, %xmm2, %xmm1
; X64-XOP-NEXT: vpslld $5, %xmm0, %xmm3
; X64-XOP-NEXT: vpaddd %xmm0, %xmm3, %xmm0
; X64-XOP-NEXT: vpsubd %xmm0, %xmm2, %xmm0
; X64-XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X64-XOP-NEXT: retq
;
; X64-AVX2-LABEL: mul_v8i32_neg33:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4294967263,4294967263,4294967263,4294967263,4294967263,4294967263,4294967263,4294967263]
; X64-AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: retq
;
; X64-AVX512DQ-LABEL: mul_v8i32_neg33:
; X64-AVX512DQ: # %bb.0:
; X64-AVX512DQ-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
; X64-AVX512DQ-NEXT: retq
  %1 = mul <8 x i32> %a0, <i32 -33, i32 -33, i32 -33, i32 -33, i32 -33, i32 -33, i32 -33, i32 -33>
  ret <8 x i32> %1
}
define <16 x i16> @mul_v16i16_neg9(<16 x i16> %a0) nounwind {
; SSE-LABEL: mul_v16i16_neg9:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm3
; SSE-NEXT: psllw $3, %xmm3
; SSE-NEXT: paddw %xmm0, %xmm3
; SSE-NEXT: pxor %xmm2, %xmm2
; SSE-NEXT: pxor %xmm0, %xmm0
; SSE-NEXT: psubw %xmm3, %xmm0
; SSE-NEXT: movdqa %xmm1, %xmm3
; SSE-NEXT: psllw $3, %xmm3
; SSE-NEXT: paddw %xmm1, %xmm3
; SSE-NEXT: psubw %xmm3, %xmm2
; SSE-NEXT: movdqa %xmm2, %xmm1
; SSE-NEXT: ret{{[l|q]}}
;
; X64-XOP-LABEL: mul_v16i16_neg9:
; X64-XOP: # %bb.0:
; X64-XOP-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-XOP-NEXT: vpsllw $3, %xmm1, %xmm2
; X64-XOP-NEXT: vpaddw %xmm1, %xmm2, %xmm1
; X64-XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
; X64-XOP-NEXT: vpsubw %xmm1, %xmm2, %xmm1
; X64-XOP-NEXT: vpsllw $3, %xmm0, %xmm3
; X64-XOP-NEXT: vpaddw %xmm0, %xmm3, %xmm0
; X64-XOP-NEXT: vpsubw %xmm0, %xmm2, %xmm0
; X64-XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X64-XOP-NEXT: retq
;
; X64-AVX2-LABEL: mul_v16i16_neg9:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vpsllw $3, %ymm0, %ymm1
; X64-AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X64-AVX2-NEXT: vpsubw %ymm0, %ymm1, %ymm0
; X64-AVX2-NEXT: retq
;
; X64-AVX512DQ-LABEL: mul_v16i16_neg9:
; X64-AVX512DQ: # %bb.0:
; X64-AVX512DQ-NEXT: vpsllw $3, %ymm0, %ymm1
; X64-AVX512DQ-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; X64-AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X64-AVX512DQ-NEXT: vpsubw %ymm0, %ymm1, %ymm0
; X64-AVX512DQ-NEXT: retq
  %1 = mul <16 x i16> %a0, <i16 -9, i16 -9, i16 -9, i16 -9, i16 -9, i16 -9, i16 -9, i16 -9, i16 -9, i16 -9, i16 -9, i16 -9, i16 -9, i16 -9, i16 -9, i16 -9>
  ret <16 x i16> %1
}
define <32 x i8> @mul_v32i8_neg5(<32 x i8> %a0) nounwind {
; SSE-LABEL: mul_v32i8_neg5:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm3
; SSE-NEXT: psllw $2, %xmm3
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; SSE-NEXT: pand %xmm4, %xmm3
; SSE-NEXT: paddb %xmm0, %xmm3
; SSE-NEXT: pxor %xmm2, %xmm2
; SSE-NEXT: pxor %xmm0, %xmm0
; SSE-NEXT: psubb %xmm3, %xmm0
; SSE-NEXT: movdqa %xmm1, %xmm3
; SSE-NEXT: psllw $2, %xmm3
; SSE-NEXT: pand %xmm4, %xmm3
; SSE-NEXT: paddb %xmm1, %xmm3
; SSE-NEXT: psubb %xmm3, %xmm2
; SSE-NEXT: movdqa %xmm2, %xmm1
; SSE-NEXT: ret{{[l|q]}}
;
; X64-XOP-LABEL: mul_v32i8_neg5:
; X64-XOP: # %bb.0:
; X64-XOP-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-XOP-NEXT: vbroadcastss {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; X64-XOP-NEXT: vpshlb %xmm2, %xmm1, %xmm3
; X64-XOP-NEXT: vpaddb %xmm1, %xmm3, %xmm1
; X64-XOP-NEXT: vpxor %xmm3, %xmm3, %xmm3
; X64-XOP-NEXT: vpsubb %xmm1, %xmm3, %xmm1
; X64-XOP-NEXT: vpshlb %xmm2, %xmm0, %xmm2
; X64-XOP-NEXT: vpaddb %xmm0, %xmm2, %xmm0
; X64-XOP-NEXT: vpsubb %xmm0, %xmm3, %xmm0
; X64-XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X64-XOP-NEXT: retq
;
; X64-AVX2-LABEL: mul_v32i8_neg5:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vpsllw $2, %ymm0, %ymm1
; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; X64-AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; X64-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X64-AVX2-NEXT: vpsubb %ymm0, %ymm1, %ymm0
; X64-AVX2-NEXT: retq
;
; X64-AVX512DQ-LABEL: mul_v32i8_neg5:
; X64-AVX512DQ: # %bb.0:
; X64-AVX512DQ-NEXT: vpsllw $2, %ymm0, %ymm1
; X64-AVX512DQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1
; X64-AVX512DQ-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; X64-AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X64-AVX512DQ-NEXT: vpsubb %ymm0, %ymm1, %ymm0
; X64-AVX512DQ-NEXT: retq
  %1 = mul <32 x i8> %a0, <i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5>
  ret <32 x i8> %1
}
;
; PowOf2 + 1 (non-uniform)
;
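; With a different 2^N + 1 in each lane there is no common shift amount, so a
; real multiply is emitted. v2i64 has no pmullq before AVX512DQ, so the
; 64-bit product is assembled from 32-bit pmuludq partial products; with
; hi(c) == 0 this reduces to lo(x)*lo(c) + ((hi(x)*lo(c)) << 32).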
define <2 x i64> @mul_v2i64_17_65(<2 x i64> %a0) nounwind {
; X86-SSE-LABEL: mul_v2i64_17_65:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [17,0,65,0]
; X86-SSE-NEXT: movdqa %xmm0, %xmm2
; X86-SSE-NEXT: pmuludq %xmm1, %xmm2
; X86-SSE-NEXT: psrlq $32, %xmm0
; X86-SSE-NEXT: pmuludq %xmm1, %xmm0
; X86-SSE-NEXT: psllq $32, %xmm0
; X86-SSE-NEXT: paddq %xmm2, %xmm0
; X86-SSE-NEXT: retl
;
; X64-SSE-LABEL: mul_v2i64_17_65:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = [17,65]
; X64-SSE-NEXT: movdqa %xmm0, %xmm2
; X64-SSE-NEXT: pmuludq %xmm1, %xmm2
; X64-SSE-NEXT: psrlq $32, %xmm0
; X64-SSE-NEXT: pmuludq %xmm1, %xmm0
; X64-SSE-NEXT: psllq $32, %xmm0
; X64-SSE-NEXT: paddq %xmm2, %xmm0
; X64-SSE-NEXT: retq
;
; X64-XOP-LABEL: mul_v2i64_17_65:
; X64-XOP: # %bb.0:
; X64-XOP-NEXT: vmovdqa {{.*#+}} xmm1 = [17,65]
; X64-XOP-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
; X64-XOP-NEXT: vpsrlq $32, %xmm0, %xmm0
; X64-XOP-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; X64-XOP-NEXT: vpsllq $32, %xmm0, %xmm0
; X64-XOP-NEXT: vpaddq %xmm0, %xmm2, %xmm0
; X64-XOP-NEXT: retq
;
; X64-AVX2-LABEL: mul_v2i64_17_65:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [17,65]
; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
; X64-AVX2-NEXT: vpsrlq $32, %xmm0, %xmm0
; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vpsllq $32, %xmm0, %xmm0
; X64-AVX2-NEXT: vpaddq %xmm0, %xmm2, %xmm0
; X64-AVX2-NEXT: retq
;
; X64-AVX512DQ-LABEL: mul_v2i64_17_65:
; X64-AVX512DQ: # %bb.0:
; X64-AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX512DQ-NEXT: retq
  %1 = mul <2 x i64> %a0, <i64 17, i64 65>
  ret <2 x i64> %1
}
define <4 x i32> @mul_v4i32_5_17_33_65(<4 x i32> %a0) nounwind {
; X86-SSE2-LABEL: mul_v4i32_5_17_33_65:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-SSE2-NEXT: retl
;
; X86-SSE4-LABEL: mul_v4i32_5_17_33_65:
; X86-SSE4: # %bb.0:
; X86-SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE4-NEXT: retl
;
; X64-SSE2-LABEL: mul_v4i32_5_17_33_65:
; X64-SSE2: # %bb.0:
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-SSE2-NEXT: retq
;
; X64-SSE4-LABEL: mul_v4i32_5_17_33_65:
; X64-SSE4: # %bb.0:
; X64-SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE4-NEXT: retq
;
; X64-AVX-LABEL: mul_v4i32_5_17_33_65:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX-NEXT: retq
  %1 = mul <4 x i32> %a0, <i32 5, i32 17, i32 33, i32 65>
  ret <4 x i32> %1
}
define <8 x i16> @mul_v8i16_2_3_9_17_33_65_129_257(<8 x i16> %a0) nounwind {
; X86-SSE-LABEL: mul_v8i16_2_3_9_17_33_65_129_257:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE-NEXT: retl
;
; X64-SSE-LABEL: mul_v8i16_2_3_9_17_33_65_129_257:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: mul_v8i16_2_3_9_17_33_65_129_257:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX-NEXT: retq
  %1 = mul <8 x i16> %a0, <i16 2, i16 3, i16 9, i16 17, i16 33, i16 65, i16 129, i16 257>
  ret <8 x i16> %1
}
define <16 x i8> @mul_v16i8_2_3_9_17_33_65_129_2_3_9_17_33_65_129_2_3(<16 x i8> %a0) nounwind {
; X86-SSE2-LABEL: mul_v16i8_2_3_9_17_33_65_129_2_3_9_17_33_65_129_2_3:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; X86-SSE2-NEXT: pand %xmm2, %xmm1
; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE2-NEXT: pand %xmm2, %xmm0
; X86-SSE2-NEXT: packuswb %xmm1, %xmm0
; X86-SSE2-NEXT: retl
;
; X86-SSE4-LABEL: mul_v16i8_2_3_9_17_33_65_129_2_3_9_17_33_65_129_2_3:
; X86-SSE4: # %bb.0:
; X86-SSE4-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X86-SSE4-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; X86-SSE4-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE4-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; X86-SSE4-NEXT: pand %xmm2, %xmm0
; X86-SSE4-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-SSE4-NEXT: pand %xmm2, %xmm1
; X86-SSE4-NEXT: packuswb %xmm0, %xmm1
; X86-SSE4-NEXT: movdqa %xmm1, %xmm0
; X86-SSE4-NEXT: retl
;
; X64-SSE2-LABEL: mul_v16i8_2_3_9_17_33_65_129_2_3_9_17_33_65_129_2_3:
; X64-SSE2: # %bb.0:
; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
; X64-SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; X64-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; X64-SSE2-NEXT: pand %xmm2, %xmm1
; X64-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X64-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE2-NEXT: pand %xmm2, %xmm0
; X64-SSE2-NEXT: packuswb %xmm1, %xmm0
; X64-SSE2-NEXT: retq
;
; X64-SSE4-LABEL: mul_v16i8_2_3_9_17_33_65_129_2_3_9_17_33_65_129_2_3:
; X64-SSE4: # %bb.0:
; X64-SSE4-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X64-SSE4-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; X64-SSE4-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE4-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; X64-SSE4-NEXT: pand %xmm2, %xmm0
; X64-SSE4-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; X64-SSE4-NEXT: pand %xmm2, %xmm1
; X64-SSE4-NEXT: packuswb %xmm0, %xmm1
; X64-SSE4-NEXT: movdqa %xmm1, %xmm0
; X64-SSE4-NEXT: retq
;
; X64-XOP-LABEL: mul_v16i8_2_3_9_17_33_65_129_2_3_9_17_33_65_129_2_3:
; X64-XOP: # %bb.0:
; X64-XOP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; X64-XOP-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; X64-XOP-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X64-XOP-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],xmm1[0,2,4,6,8,10,12,14]
; X64-XOP-NEXT: retq
;
; X64-AVX2-LABEL: mul_v16i8_2_3_9_17_33_65_129_2_3_9_17_33_65_129_2_3:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; X64-AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vzeroupper
; X64-AVX2-NEXT: retq
;
; X64-AVX512DQ-LABEL: mul_v16i8_2_3_9_17_33_65_129_2_3_9_17_33_65_129_2_3:
; X64-AVX512DQ: # %bb.0:
; X64-AVX512DQ-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; X64-AVX512DQ-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; X64-AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; X64-AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; X64-AVX512DQ-NEXT: vzeroupper
; X64-AVX512DQ-NEXT: retq
  %1 = mul <16 x i8> %a0, <i8 2, i8 3, i8 9, i8 17, i8 33, i8 65, i8 129, i8 2, i8 3, i8 9, i8 17, i8 33, i8 65, i8 129, i8 2, i8 3>
  ret <16 x i8> %1
}
;
; PowOf2 - 1 (uniform)
;
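; 2^N - 1 factors to a shift plus a subtract: x * 7 == (x << 3) - x.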
define <2 x i64> @mul_v2i64_7(<2 x i64> %a0) nounwind {
; SSE-LABEL: mul_v2i64_7:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psllq $3, %xmm1
; SSE-NEXT: psubq %xmm0, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: ret{{[l|q]}}
;
; X64-AVX-LABEL: mul_v2i64_7:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vpsllq $3, %xmm0, %xmm1
; X64-AVX-NEXT: vpsubq %xmm0, %xmm1, %xmm0
; X64-AVX-NEXT: retq
  %1 = mul <2 x i64> %a0, <i64 7, i64 7>
  ret <2 x i64> %1
}
define <4 x i32> @mul_v4i32_7(<4 x i32> %a0) nounwind {
; SSE-LABEL: mul_v4i32_7:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: pslld $3, %xmm1
; SSE-NEXT: psubd %xmm0, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: ret{{[l|q]}}
;
; X64-XOP-LABEL: mul_v4i32_7:
; X64-XOP: # %bb.0:
; X64-XOP-NEXT: vpslld $3, %xmm0, %xmm1
; X64-XOP-NEXT: vpsubd %xmm0, %xmm1, %xmm0
; X64-XOP-NEXT: retq
;
; X64-AVX2-LABEL: mul_v4i32_7:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [7,7,7,7]
; X64-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: retq
;
; X64-AVX512DQ-LABEL: mul_v4i32_7:
; X64-AVX512DQ: # %bb.0:
; X64-AVX512DQ-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; X64-AVX512DQ-NEXT: retq
  %1 = mul <4 x i32> %a0, <i32 7, i32 7, i32 7, i32 7>
  ret <4 x i32> %1
}
define <8 x i16> @mul_v8i16_7(<8 x i16> %a0) nounwind {
; SSE-LABEL: mul_v8i16_7:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psllw $3, %xmm1
; SSE-NEXT: psubw %xmm0, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: ret{{[l|q]}}
;
; X64-AVX-LABEL: mul_v8i16_7:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vpsllw $3, %xmm0, %xmm1
; X64-AVX-NEXT: vpsubw %xmm0, %xmm1, %xmm0
; X64-AVX-NEXT: retq
  %1 = mul <8 x i16> %a0, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
  ret <8 x i16> %1
}
define <16 x i8> @mul_v16i8_31(<16 x i8> %a0) nounwind {
; X86-SSE-LABEL: mul_v16i8_31:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: movdqa %xmm0, %xmm1
; X86-SSE-NEXT: psllw $5, %xmm1
; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-SSE-NEXT: psubb %xmm0, %xmm1
; X86-SSE-NEXT: movdqa %xmm1, %xmm0
; X86-SSE-NEXT: retl
;
; X64-SSE-LABEL: mul_v16i8_31:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: movdqa %xmm0, %xmm1
; X64-SSE-NEXT: psllw $5, %xmm1
; X64-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; X64-SSE-NEXT: psubb %xmm0, %xmm1
; X64-SSE-NEXT: movdqa %xmm1, %xmm0
; X64-SSE-NEXT: retq
;
; X64-XOP-LABEL: mul_v16i8_31:
; X64-XOP: # %bb.0:
; X64-XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; X64-XOP-NEXT: vpsubb %xmm0, %xmm1, %xmm0
; X64-XOP-NEXT: retq
;
; X64-AVX2-LABEL: mul_v16i8_31:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vpsllw $5, %xmm0, %xmm1
; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; X64-AVX2-NEXT: vpsubb %xmm0, %xmm1, %xmm0
; X64-AVX2-NEXT: retq
;
; X64-AVX512DQ-LABEL: mul_v16i8_31:
; X64-AVX512DQ: # %bb.0:
; X64-AVX512DQ-NEXT: vpsllw $5, %xmm0, %xmm1
; X64-AVX512DQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
; X64-AVX512DQ-NEXT: vpsubb %xmm0, %xmm1, %xmm0
; X64-AVX512DQ-NEXT: retq
  %1 = mul <16 x i8> %a0, <i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31>
  ret <16 x i8> %1
}
;
; -(PowOf2 - 1) (uniform)
;
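; For -(2^N - 1) the subtract operands swap, so the negate comes for free:
; x * -7 == x - (x << 3).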
define <2 x i64> @mul_v2i64_neg7(<2 x i64> %a0) nounwind {
; SSE-LABEL: mul_v2i64_neg7:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psllq $3, %xmm1
; SSE-NEXT: psubq %xmm1, %xmm0
; SSE-NEXT: ret{{[l|q]}}
;
; X64-AVX-LABEL: mul_v2i64_neg7:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vpsllq $3, %xmm0, %xmm1
; X64-AVX-NEXT: vpsubq %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT: retq
  %1 = mul <2 x i64> %a0, <i64 -7, i64 -7>
  ret <2 x i64> %1
}
define <4 x i32> @mul_v4i32_neg63(<4 x i32> %a0) nounwind {
; SSE-LABEL: mul_v4i32_neg63:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: pslld $6, %xmm1
; SSE-NEXT: psubd %xmm1, %xmm0
; SSE-NEXT: ret{{[l|q]}}
;
; X64-XOP-LABEL: mul_v4i32_neg63:
; X64-XOP: # %bb.0:
; X64-XOP-NEXT: vpslld $6, %xmm0, %xmm1
; X64-XOP-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; X64-XOP-NEXT: retq
;
; X64-AVX2-LABEL: mul_v4i32_neg63:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294967233,4294967233,4294967233,4294967233]
; X64-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: retq
;
; X64-AVX512DQ-LABEL: mul_v4i32_neg63:
; X64-AVX512DQ: # %bb.0:
; X64-AVX512DQ-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; X64-AVX512DQ-NEXT: retq
  %1 = mul <4 x i32> %a0, <i32 -63, i32 -63, i32 -63, i32 -63>
  ret <4 x i32> %1
}
define <8 x i16> @mul_v8i16_neg31(<8 x i16> %a0) nounwind {
; SSE-LABEL: mul_v8i16_neg31:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psllw $5, %xmm1
; SSE-NEXT: psubw %xmm1, %xmm0
; SSE-NEXT: ret{{[l|q]}}
;
; X64-AVX-LABEL: mul_v8i16_neg31:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vpsllw $5, %xmm0, %xmm1
; X64-AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT: retq
  %1 = mul <8 x i16> %a0, <i16 -31, i16 -31, i16 -31, i16 -31, i16 -31, i16 -31, i16 -31, i16 -31>
  ret <8 x i16> %1
}
define <16 x i8> @mul_v16i8_neg15(<16 x i8> %a0) nounwind {
; X86-SSE-LABEL: mul_v16i8_neg15:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: movdqa %xmm0, %xmm1
; X86-SSE-NEXT: psllw $4, %xmm1
; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-SSE-NEXT: psubb %xmm1, %xmm0
; X86-SSE-NEXT: retl
;
; X64-SSE-LABEL: mul_v16i8_neg15:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: movdqa %xmm0, %xmm1
; X64-SSE-NEXT: psllw $4, %xmm1
; X64-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; X64-SSE-NEXT: psubb %xmm1, %xmm0
; X64-SSE-NEXT: retq
;
; X64-XOP-LABEL: mul_v16i8_neg15:
; X64-XOP: # %bb.0:
; X64-XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; X64-XOP-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; X64-XOP-NEXT: retq
;
; X64-AVX2-LABEL: mul_v16i8_neg15:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vpsllw $4, %xmm0, %xmm1
; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; X64-AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: retq
;
; X64-AVX512DQ-LABEL: mul_v16i8_neg15:
; X64-AVX512DQ: # %bb.0:
; X64-AVX512DQ-NEXT: vpsllw $4, %xmm0, %xmm1
; X64-AVX512DQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
; X64-AVX512DQ-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; X64-AVX512DQ-NEXT: retq
  %1 = mul <16 x i8> %a0, <i8 -15, i8 -15, i8 -15, i8 -15, i8 -15, i8 -15, i8 -15, i8 -15, i8 -15, i8 -15, i8 -15, i8 -15, i8 -15, i8 -15, i8 -15, i8 -15>
  ret <16 x i8> %1
}
;
; PowOf2 - 1 (non-uniform)
;
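; Mixed 2^N - 1 lanes again need a real multiply. When a lane constant is
; negative, hi(c) is all ones and the full pmuludq expansion appears:
;   x * c == lo(x)*lo(c) + ((lo(x)*hi(c) + hi(x)*lo(c)) << 32)
; AVX512DQ folds each of these cases into a single vpmullq.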
define <2 x i64> @mul_v2i64_15_63(<2 x i64> %a0) nounwind {
; X86-SSE-LABEL: mul_v2i64_15_63:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [15,0,63,0]
; X86-SSE-NEXT: movdqa %xmm0, %xmm2
; X86-SSE-NEXT: pmuludq %xmm1, %xmm2
; X86-SSE-NEXT: psrlq $32, %xmm0
; X86-SSE-NEXT: pmuludq %xmm1, %xmm0
; X86-SSE-NEXT: psllq $32, %xmm0
; X86-SSE-NEXT: paddq %xmm2, %xmm0
; X86-SSE-NEXT: retl
;
; X64-SSE-LABEL: mul_v2i64_15_63:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = [15,63]
; X64-SSE-NEXT: movdqa %xmm0, %xmm2
; X64-SSE-NEXT: pmuludq %xmm1, %xmm2
; X64-SSE-NEXT: psrlq $32, %xmm0
; X64-SSE-NEXT: pmuludq %xmm1, %xmm0
; X64-SSE-NEXT: psllq $32, %xmm0
; X64-SSE-NEXT: paddq %xmm2, %xmm0
; X64-SSE-NEXT: retq
;
; X64-XOP-LABEL: mul_v2i64_15_63:
; X64-XOP: # %bb.0:
; X64-XOP-NEXT: vmovdqa {{.*#+}} xmm1 = [15,63]
; X64-XOP-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
; X64-XOP-NEXT: vpsrlq $32, %xmm0, %xmm0
; X64-XOP-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; X64-XOP-NEXT: vpsllq $32, %xmm0, %xmm0
; X64-XOP-NEXT: vpaddq %xmm0, %xmm2, %xmm0
; X64-XOP-NEXT: retq
;
; X64-AVX2-LABEL: mul_v2i64_15_63:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,63]
; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
; X64-AVX2-NEXT: vpsrlq $32, %xmm0, %xmm0
; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vpsllq $32, %xmm0, %xmm0
; X64-AVX2-NEXT: vpaddq %xmm0, %xmm2, %xmm0
; X64-AVX2-NEXT: retq
;
; X64-AVX512DQ-LABEL: mul_v2i64_15_63:
; X64-AVX512DQ: # %bb.0:
; X64-AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX512DQ-NEXT: retq
  %1 = mul <2 x i64> %a0, <i64 15, i64 63>
  ret <2 x i64> %1
}
define <2 x i64> @mul_v2i64_neg_15_63(<2 x i64> %a0) nounwind {
; X86-SSE-LABEL: mul_v2i64_neg_15_63:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: pcmpeqd %xmm1, %xmm1
; X86-SSE-NEXT: pmuludq %xmm0, %xmm1
; X86-SSE-NEXT: movdqa %xmm0, %xmm2
; X86-SSE-NEXT: psrlq $32, %xmm2
; X86-SSE-NEXT: movdqa {{.*#+}} xmm3 = [4294967281,4294967295,4294967233,4294967295]
; X86-SSE-NEXT: pmuludq %xmm3, %xmm2
; X86-SSE-NEXT: paddq %xmm1, %xmm2
; X86-SSE-NEXT: psllq $32, %xmm2
; X86-SSE-NEXT: pmuludq %xmm3, %xmm0
; X86-SSE-NEXT: paddq %xmm2, %xmm0
; X86-SSE-NEXT: retl
;
; X64-SSE-LABEL: mul_v2i64_neg_15_63:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = [18446744073709551601,18446744073709551553]
; X64-SSE-NEXT: movdqa %xmm0, %xmm2
; X64-SSE-NEXT: pmuludq %xmm1, %xmm2
; X64-SSE-NEXT: movdqa %xmm0, %xmm3
; X64-SSE-NEXT: psrlq $32, %xmm3
; X64-SSE-NEXT: pmuludq %xmm1, %xmm3
; X64-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE-NEXT: paddq %xmm3, %xmm0
; X64-SSE-NEXT: psllq $32, %xmm0
; X64-SSE-NEXT: paddq %xmm2, %xmm0
; X64-SSE-NEXT: retq
;
; X64-XOP-LABEL: mul_v2i64_neg_15_63:
; X64-XOP: # %bb.0:
; X64-XOP-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709551601,18446744073709551553]
; X64-XOP-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
; X64-XOP-NEXT: vpsrlq $32, %xmm0, %xmm3
; X64-XOP-NEXT: vpmuludq %xmm1, %xmm3, %xmm1
; X64-XOP-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-XOP-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; X64-XOP-NEXT: vpsllq $32, %xmm0, %xmm0
; X64-XOP-NEXT: vpaddq %xmm0, %xmm2, %xmm0
; X64-XOP-NEXT: retq
;
; X64-AVX2-LABEL: mul_v2i64_neg_15_63:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709551601,18446744073709551553]
; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
; X64-AVX2-NEXT: vpsrlq $32, %xmm0, %xmm3
; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm3, %xmm1
; X64-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vpsllq $32, %xmm0, %xmm0
; X64-AVX2-NEXT: vpaddq %xmm0, %xmm2, %xmm0
; X64-AVX2-NEXT: retq
;
; X64-AVX512DQ-LABEL: mul_v2i64_neg_15_63:
; X64-AVX512DQ: # %bb.0:
; X64-AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX512DQ-NEXT: retq
  %1 = mul <2 x i64> %a0, <i64 -15, i64 -63>
  ret <2 x i64> %1
}
define <2 x i64> @mul_v2i64_neg_17_65(<2 x i64> %a0) nounwind {
; X86-SSE-LABEL: mul_v2i64_neg_17_65:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: pcmpeqd %xmm1, %xmm1
; X86-SSE-NEXT: pmuludq %xmm0, %xmm1
; X86-SSE-NEXT: movdqa %xmm0, %xmm2
; X86-SSE-NEXT: psrlq $32, %xmm2
; X86-SSE-NEXT: movdqa {{.*#+}} xmm3 = [4294967279,4294967295,4294967231,4294967295]
; X86-SSE-NEXT: pmuludq %xmm3, %xmm2
; X86-SSE-NEXT: paddq %xmm1, %xmm2
; X86-SSE-NEXT: psllq $32, %xmm2
; X86-SSE-NEXT: pmuludq %xmm3, %xmm0
; X86-SSE-NEXT: paddq %xmm2, %xmm0
; X86-SSE-NEXT: retl
;
; X64-SSE-LABEL: mul_v2i64_neg_17_65:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = [18446744073709551599,18446744073709551551]
; X64-SSE-NEXT: movdqa %xmm0, %xmm2
; X64-SSE-NEXT: pmuludq %xmm1, %xmm2
; X64-SSE-NEXT: movdqa %xmm0, %xmm3
; X64-SSE-NEXT: psrlq $32, %xmm3
; X64-SSE-NEXT: pmuludq %xmm1, %xmm3
; X64-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE-NEXT: paddq %xmm3, %xmm0
; X64-SSE-NEXT: psllq $32, %xmm0
; X64-SSE-NEXT: paddq %xmm2, %xmm0
; X64-SSE-NEXT: retq
;
; X64-XOP-LABEL: mul_v2i64_neg_17_65:
; X64-XOP: # %bb.0:
; X64-XOP-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709551599,18446744073709551551]
; X64-XOP-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
; X64-XOP-NEXT: vpsrlq $32, %xmm0, %xmm3
; X64-XOP-NEXT: vpmuludq %xmm1, %xmm3, %xmm1
; X64-XOP-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-XOP-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; X64-XOP-NEXT: vpsllq $32, %xmm0, %xmm0
; X64-XOP-NEXT: vpaddq %xmm0, %xmm2, %xmm0
; X64-XOP-NEXT: retq
;
; X64-AVX2-LABEL: mul_v2i64_neg_17_65:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709551599,18446744073709551551]
; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
; X64-AVX2-NEXT: vpsrlq $32, %xmm0, %xmm3
; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm3, %xmm1
; X64-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vpsllq $32, %xmm0, %xmm0
; X64-AVX2-NEXT: vpaddq %xmm0, %xmm2, %xmm0
; X64-AVX2-NEXT: retq
;
; X64-AVX512DQ-LABEL: mul_v2i64_neg_17_65:
; X64-AVX512DQ: # %bb.0:
; X64-AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX512DQ-NEXT: retq
  %1 = mul <2 x i64> %a0, <i64 -17, i64 -65>
  ret <2 x i64> %1
}
define <2 x i64> @mul_v2i64_0_1(<2 x i64> %a0) nounwind {
; X86-SSE2-LABEL: mul_v2i64_0_1:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: xorpd %xmm1, %xmm1
; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; X86-SSE2-NEXT: retl
;
; SSE4-LABEL: mul_v2i64_0_1:
; SSE4: # %bb.0:
; SSE4-NEXT: xorps %xmm1, %xmm1
; SSE4-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; SSE4-NEXT: ret{{[l|q]}}
;
; X64-SSE2-LABEL: mul_v2i64_0_1:
; X64-SSE2: # %bb.0:
; X64-SSE2-NEXT: xorps %xmm1, %xmm1
; X64-SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; X64-SSE2-NEXT: movaps %xmm1, %xmm0
; X64-SSE2-NEXT: retq
;
; X64-AVX-LABEL: mul_v2i64_0_1:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; X64-AVX-NEXT: retq
  %1 = mul <2 x i64> %a0, <i64 0, i64 1>
  ret <2 x i64> %1
}
define <2 x i64> @mul_v2i64_neg_0_1(<2 x i64> %a0) nounwind {
; X86-SSE-LABEL: mul_v2i64_neg_0_1:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,0,4294967295,4294967295]
; X86-SSE-NEXT: movdqa %xmm0, %xmm2
; X86-SSE-NEXT: pmuludq %xmm1, %xmm2
; X86-SSE-NEXT: movdqa %xmm0, %xmm3
; X86-SSE-NEXT: psrlq $32, %xmm3
; X86-SSE-NEXT: pmuludq %xmm1, %xmm3
; X86-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE-NEXT: paddq %xmm3, %xmm0
; X86-SSE-NEXT: psllq $32, %xmm0
; X86-SSE-NEXT: paddq %xmm2, %xmm0
; X86-SSE-NEXT: retl
;
; X64-SSE-LABEL: mul_v2i64_neg_0_1:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255]
; X64-SSE-NEXT: movdqa %xmm0, %xmm2
; X64-SSE-NEXT: pmuludq %xmm1, %xmm2
; X64-SSE-NEXT: movdqa %xmm0, %xmm3
; X64-SSE-NEXT: psrlq $32, %xmm3
; X64-SSE-NEXT: pmuludq %xmm1, %xmm3
; X64-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE-NEXT: paddq %xmm3, %xmm0
; X64-SSE-NEXT: psllq $32, %xmm0
; X64-SSE-NEXT: paddq %xmm2, %xmm0
; X64-SSE-NEXT: retq
;
; X64-XOP-LABEL: mul_v2i64_neg_0_1:
; X64-XOP: # %bb.0:
; X64-XOP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255]
; X64-XOP-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
; X64-XOP-NEXT: vpsrlq $32, %xmm0, %xmm3
; X64-XOP-NEXT: vpmuludq %xmm1, %xmm3, %xmm1
; X64-XOP-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-XOP-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; X64-XOP-NEXT: vpsllq $32, %xmm0, %xmm0
; X64-XOP-NEXT: vpaddq %xmm0, %xmm2, %xmm0
; X64-XOP-NEXT: retq
;
; X64-AVX2-LABEL: mul_v2i64_neg_0_1:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255]
; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
; X64-AVX2-NEXT: vpsrlq $32, %xmm0, %xmm3
; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm3, %xmm1
; X64-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vpsllq $32, %xmm0, %xmm0
; X64-AVX2-NEXT: vpaddq %xmm0, %xmm2, %xmm0
; X64-AVX2-NEXT: retq
;
; X64-AVX512DQ-LABEL: mul_v2i64_neg_0_1:
; X64-AVX512DQ: # %bb.0:
; X64-AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX512DQ-NEXT: retq
  %1 = mul <2 x i64> %a0, <i64 0, i64 -1>
  ret <2 x i64> %1
}
define <2 x i64> @mul_v2i64_15_neg_63(<2 x i64> %a0) nounwind {
; X86-SSE-LABEL: mul_v2i64_15_neg_63:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [15,0,4294967233,4294967295]
; X86-SSE-NEXT: movdqa %xmm0, %xmm2
; X86-SSE-NEXT: pmuludq %xmm1, %xmm2
; X86-SSE-NEXT: movdqa %xmm0, %xmm3
; X86-SSE-NEXT: psrlq $32, %xmm3
; X86-SSE-NEXT: pmuludq %xmm1, %xmm3
; X86-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE-NEXT: paddq %xmm3, %xmm0
; X86-SSE-NEXT: psllq $32, %xmm0
; X86-SSE-NEXT: paddq %xmm2, %xmm0
; X86-SSE-NEXT: retl
;
; X64-SSE-LABEL: mul_v2i64_15_neg_63:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = [15,18446744073709551553]
; X64-SSE-NEXT: movdqa %xmm0, %xmm2
; X64-SSE-NEXT: pmuludq %xmm1, %xmm2
; X64-SSE-NEXT: movdqa %xmm0, %xmm3
; X64-SSE-NEXT: psrlq $32, %xmm3
; X64-SSE-NEXT: pmuludq %xmm1, %xmm3
; X64-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE-NEXT: paddq %xmm3, %xmm0
; X64-SSE-NEXT: psllq $32, %xmm0
; X64-SSE-NEXT: paddq %xmm2, %xmm0
; X64-SSE-NEXT: retq
;
; X64-XOP-LABEL: mul_v2i64_15_neg_63:
; X64-XOP: # %bb.0:
; X64-XOP-NEXT: vmovdqa {{.*#+}} xmm1 = [15,18446744073709551553]
; X64-XOP-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
; X64-XOP-NEXT: vpsrlq $32, %xmm0, %xmm3
; X64-XOP-NEXT: vpmuludq %xmm1, %xmm3, %xmm1
; X64-XOP-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-XOP-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; X64-XOP-NEXT: vpsllq $32, %xmm0, %xmm0
; X64-XOP-NEXT: vpaddq %xmm0, %xmm2, %xmm0
; X64-XOP-NEXT: retq
;
; X64-AVX2-LABEL: mul_v2i64_15_neg_63:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,18446744073709551553]
; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
; X64-AVX2-NEXT: vpsrlq $32, %xmm0, %xmm3
; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm3, %xmm1
; X64-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vpsllq $32, %xmm0, %xmm0
; X64-AVX2-NEXT: vpaddq %xmm0, %xmm2, %xmm0
; X64-AVX2-NEXT: retq
;
; X64-AVX512DQ-LABEL: mul_v2i64_15_neg_63:
; X64-AVX512DQ: # %bb.0:
; X64-AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX512DQ-NEXT: retq
%1 = mul <2 x i64> %a0, <i64 15, i64 -63>
ret <2 x i64> %1
}

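; SSE2 has no v4i32 multiply, so the even and odd elements are multiplied
; separately with pmuludq and reassembled with shuffles; SSE4.1 and AVX
; targets can use pmulld directly.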
define <4 x i32> @mul_v4i32_0_15_31_7(<4 x i32> %a0) nounwind {
; X86-SSE2-LABEL: mul_v4i32_0_15_31_7:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-SSE2-NEXT: retl
;
; X86-SSE4-LABEL: mul_v4i32_0_15_31_7:
; X86-SSE4: # %bb.0:
; X86-SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE4-NEXT: retl
;
; X64-SSE2-LABEL: mul_v4i32_0_15_31_7:
; X64-SSE2: # %bb.0:
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-SSE2-NEXT: retq
;
; X64-SSE4-LABEL: mul_v4i32_0_15_31_7:
; X64-SSE4: # %bb.0:
; X64-SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE4-NEXT: retq
;
; X64-AVX-LABEL: mul_v4i32_0_15_31_7:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX-NEXT: retq
%1 = mul <4 x i32> %a0, <i32 0, i32 15, i32 31, i32 7>
ret <4 x i32> %1
}

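; pmullw has been available since SSE2, so the v8i16 case is a single
; instruction on every subtarget.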
define <8 x i16> @mul_v8i16_0_1_7_15_31_63_127_255(<8 x i16> %a0) nounwind {
; X86-SSE-LABEL: mul_v8i16_0_1_7_15_31_63_127_255:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE-NEXT: retl
;
; X64-SSE-LABEL: mul_v8i16_0_1_7_15_31_63_127_255:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: mul_v8i16_0_1_7_15_31_63_127_255:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX-NEXT: retq
%1 = mul <8 x i16> %a0, <i16 0, i16 1, i16 7, i16 15, i16 31, i16 63, i16 127, i16 255>
ret <8 x i16> %1
}

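; x86 has no vector byte multiply, so v16i8 is widened to i16, multiplied
; with pmullw, then narrowed again: pand+packuswb on SSE, vpperm on XOP and
; a vpmovdb truncation on AVX512DQ.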
define <16 x i8> @mul_v16i8_0_1_3_7_15_31_63_127_0_1_3_7_15_31_63_127(<16 x i8> %a0) nounwind {
; SSE2-LABEL: mul_v16i8_0_1_3_7_15_31_63_127_0_1_3_7_15_31_63_127:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,1,3,7,15,31,63,127]
; SSE2-NEXT: pmullw %xmm2, %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm3, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pmullw %xmm2, %xmm0
; SSE2-NEXT: pand %xmm3, %xmm0
; SSE2-NEXT: packuswb %xmm1, %xmm0
; SSE2-NEXT: ret{{[l|q]}}
;
; SSE4-LABEL: mul_v16i8_0_1_3_7_15_31_63_127_0_1_3_7_15_31_63_127:
; SSE4: # %bb.0:
; SSE4-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE4-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE4-NEXT: movdqa {{.*#+}} xmm2 = [0,1,3,7,15,31,63,127]
; SSE4-NEXT: pmullw %xmm2, %xmm0
; SSE4-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; SSE4-NEXT: pand %xmm3, %xmm0
; SSE4-NEXT: pmullw %xmm2, %xmm1
; SSE4-NEXT: pand %xmm3, %xmm1
; SSE4-NEXT: packuswb %xmm0, %xmm1
; SSE4-NEXT: movdqa %xmm1, %xmm0
; SSE4-NEXT: ret{{[l|q]}}
;
; X64-XOP-LABEL: mul_v16i8_0_1_3_7_15_31_63_127_0_1_3_7_15_31_63_127:
; X64-XOP: # %bb.0:
; X64-XOP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; X64-XOP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,3,7,15,31,63,127]
; X64-XOP-NEXT: vpmullw %xmm2, %xmm1, %xmm1
; X64-XOP-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X64-XOP-NEXT: vpmullw %xmm2, %xmm0, %xmm0
; X64-XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],xmm1[0,2,4,6,8,10,12,14]
; X64-XOP-NEXT: retq
;
; X64-AVX2-LABEL: mul_v16i8_0_1_3_7_15_31_63_127_0_1_3_7_15_31_63_127:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; X64-AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vzeroupper
; X64-AVX2-NEXT: retq
;
; X64-AVX512DQ-LABEL: mul_v16i8_0_1_3_7_15_31_63_127_0_1_3_7_15_31_63_127:
; X64-AVX512DQ: # %bb.0:
; X64-AVX512DQ-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; X64-AVX512DQ-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; X64-AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; X64-AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; X64-AVX512DQ-NEXT: vzeroupper
; X64-AVX512DQ-NEXT: retq
%1 = mul <16 x i8> %a0, <i8 0, i8 1, i8 3, i8 7, i8 15, i8 31, i8 63, i8 127, i8 0, i8 1, i8 3, i8 7, i8 15, i8 31, i8 63, i8 127>
ret <16 x i8> %1
}

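; 68 and 132 fit in 32 bits, so the lo(a)*hi(c) partial product is known to
; be zero and the expansion needs only two pmuludq instead of three.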
define <2 x i64> @mul_v2i64_68_132(<2 x i64> %x) nounwind {
; X86-SSE-LABEL: mul_v2i64_68_132:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [68,0,132,0]
; X86-SSE-NEXT: movdqa %xmm0, %xmm2
; X86-SSE-NEXT: pmuludq %xmm1, %xmm2
; X86-SSE-NEXT: psrlq $32, %xmm0
; X86-SSE-NEXT: pmuludq %xmm1, %xmm0
; X86-SSE-NEXT: psllq $32, %xmm0
; X86-SSE-NEXT: paddq %xmm2, %xmm0
; X86-SSE-NEXT: retl
;
; X64-SSE-LABEL: mul_v2i64_68_132:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = [68,132]
; X64-SSE-NEXT: movdqa %xmm0, %xmm2
; X64-SSE-NEXT: pmuludq %xmm1, %xmm2
; X64-SSE-NEXT: psrlq $32, %xmm0
; X64-SSE-NEXT: pmuludq %xmm1, %xmm0
; X64-SSE-NEXT: psllq $32, %xmm0
; X64-SSE-NEXT: paddq %xmm2, %xmm0
; X64-SSE-NEXT: retq
;
; X64-XOP-LABEL: mul_v2i64_68_132:
; X64-XOP: # %bb.0:
; X64-XOP-NEXT: vmovdqa {{.*#+}} xmm1 = [68,132]
; X64-XOP-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
; X64-XOP-NEXT: vpsrlq $32, %xmm0, %xmm0
; X64-XOP-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; X64-XOP-NEXT: vpsllq $32, %xmm0, %xmm0
; X64-XOP-NEXT: vpaddq %xmm0, %xmm2, %xmm0
; X64-XOP-NEXT: retq
;
; X64-AVX2-LABEL: mul_v2i64_68_132:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [68,132]
; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
; X64-AVX2-NEXT: vpsrlq $32, %xmm0, %xmm0
; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vpsllq $32, %xmm0, %xmm0
; X64-AVX2-NEXT: vpaddq %xmm0, %xmm2, %xmm0
; X64-AVX2-NEXT: retq
;
; X64-AVX512DQ-LABEL: mul_v2i64_68_132:
; X64-AVX512DQ: # %bb.0:
; X64-AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX512DQ-NEXT: retq
%mul = mul <2 x i64> %x, <i64 68, i64 132>
ret <2 x i64> %mul
}

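; Same two-pmuludq expansion as above: 60 and 124 also fit in 32 bits.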
define <2 x i64> @mul_v2i64_60_120(<2 x i64> %x) nounwind {
; X86-SSE-LABEL: mul_v2i64_60_120:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [60,0,124,0]
; X86-SSE-NEXT: movdqa %xmm0, %xmm2
; X86-SSE-NEXT: pmuludq %xmm1, %xmm2
; X86-SSE-NEXT: psrlq $32, %xmm0
; X86-SSE-NEXT: pmuludq %xmm1, %xmm0
; X86-SSE-NEXT: psllq $32, %xmm0
; X86-SSE-NEXT: paddq %xmm2, %xmm0
; X86-SSE-NEXT: retl
;
; X64-SSE-LABEL: mul_v2i64_60_120:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = [60,124]
; X64-SSE-NEXT: movdqa %xmm0, %xmm2
; X64-SSE-NEXT: pmuludq %xmm1, %xmm2
; X64-SSE-NEXT: psrlq $32, %xmm0
; X64-SSE-NEXT: pmuludq %xmm1, %xmm0
; X64-SSE-NEXT: psllq $32, %xmm0
; X64-SSE-NEXT: paddq %xmm2, %xmm0
; X64-SSE-NEXT: retq
;
; X64-XOP-LABEL: mul_v2i64_60_120:
; X64-XOP: # %bb.0:
; X64-XOP-NEXT: vmovdqa {{.*#+}} xmm1 = [60,124]
; X64-XOP-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
; X64-XOP-NEXT: vpsrlq $32, %xmm0, %xmm0
; X64-XOP-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; X64-XOP-NEXT: vpsllq $32, %xmm0, %xmm0
; X64-XOP-NEXT: vpaddq %xmm0, %xmm2, %xmm0
; X64-XOP-NEXT: retq
;
; X64-AVX2-LABEL: mul_v2i64_60_120:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [60,124]
; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
; X64-AVX2-NEXT: vpsrlq $32, %xmm0, %xmm0
; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vpsllq $32, %xmm0, %xmm0
; X64-AVX2-NEXT: vpaddq %xmm0, %xmm2, %xmm0
; X64-AVX2-NEXT: retq
;
; X64-AVX512DQ-LABEL: mul_v2i64_60_120:
; X64-AVX512DQ: # %bb.0:
; X64-AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX512DQ-NEXT: retq
%mul = mul <2 x i64> %x, <i64 60, i64 124>
ret <2 x i64> %mul
}

; We unfortunately can't see the zext that lives in the other basic block so we
; don't know that we only need one pmuludq to compute the full 64 bits. This
; sort of issue is more likely to occur when there is a loop and one of the
; multiply inputs is loop invariant.
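; (Instruction selection works one basic block at a time, so by the time the
; mul is lowered the known-zero upper bits established in the entry block
; have been lost.)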
define <2 x i64> @mul_v2i64_zext_cross_bb(ptr %in, ptr %y) {
; X86-SSE2-LABEL: mul_v2i64_zext_cross_bb:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; X86-SSE2-NEXT: pxor %xmm1, %xmm1
; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0,0,1,1]
; X86-SSE2-NEXT: pmuludq %xmm1, %xmm0
; X86-SSE2-NEXT: retl
;
; X86-SSE4-LABEL: mul_v2i64_zext_cross_bb:
; X86-SSE4: # %bb.0:
; X86-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE4-NEXT: pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
; X86-SSE4-NEXT: pmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
; X86-SSE4-NEXT: pmuludq %xmm1, %xmm0
; X86-SSE4-NEXT: retl
;
; X64-SSE2-LABEL: mul_v2i64_zext_cross_bb:
; X64-SSE2: # %bb.0:
; X64-SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; X64-SSE2-NEXT: pxor %xmm1, %xmm1
; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
; X64-SSE2-NEXT: pmuludq %xmm1, %xmm0
; X64-SSE2-NEXT: retq
;
; X64-SSE4-LABEL: mul_v2i64_zext_cross_bb:
; X64-SSE4: # %bb.0:
; X64-SSE4-NEXT: pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
; X64-SSE4-NEXT: pmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
; X64-SSE4-NEXT: pmuludq %xmm1, %xmm0
; X64-SSE4-NEXT: retq
;
; X64-AVX-LABEL: mul_v2i64_zext_cross_bb:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
; X64-AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
; X64-AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT: retq
%a = load <2 x i32>, ptr %in
%b = zext <2 x i32> %a to <2 x i64>
br label %foo

foo:
%c = load <2 x i32>, ptr %y
%d = zext <2 x i32> %c to <2 x i64>
%e = mul <2 x i64> %b, %d
ret <2 x i64> %e
}

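; The same cross-bb pattern at v4i64: AVX2 and AVX512DQ can still cover each
; operand with a single 256-bit vpmovzxdq load and use one vpmuludq.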
define <4 x i64> @mul_v4i64_zext_cross_bb(ptr %in, ptr %y) {
; X86-SSE2-LABEL: mul_v4i64_zext_cross_bb:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE2-NEXT: movdqa (%ecx), %xmm0
; X86-SSE2-NEXT: pxor %xmm2, %xmm2
; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
; X86-SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X86-SSE2-NEXT: movdqa (%eax), %xmm2
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,1,3,3]
; X86-SSE2-NEXT: pmuludq %xmm3, %xmm1
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,3]
; X86-SSE2-NEXT: pmuludq %xmm2, %xmm0
; X86-SSE2-NEXT: retl
;
; X86-SSE4-LABEL: mul_v4i64_zext_cross_bb:
; X86-SSE4: # %bb.0:
; X86-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE4-NEXT: pmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
; X86-SSE4-NEXT: pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
; X86-SSE4-NEXT: pmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero
; X86-SSE4-NEXT: pmuludq %xmm2, %xmm1
; X86-SSE4-NEXT: pmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero
; X86-SSE4-NEXT: pmuludq %xmm2, %xmm0
; X86-SSE4-NEXT: retl
;
; X64-SSE2-LABEL: mul_v4i64_zext_cross_bb:
; X64-SSE2: # %bb.0:
; X64-SSE2-NEXT: movdqa (%rdi), %xmm0
; X64-SSE2-NEXT: pxor %xmm2, %xmm2
; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
; X64-SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X64-SSE2-NEXT: movdqa (%rsi), %xmm2
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,1,3,3]
; X64-SSE2-NEXT: pmuludq %xmm3, %xmm1
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,3]
; X64-SSE2-NEXT: pmuludq %xmm2, %xmm0
; X64-SSE2-NEXT: retq
;
; X64-SSE4-LABEL: mul_v4i64_zext_cross_bb:
; X64-SSE4: # %bb.0:
; X64-SSE4-NEXT: pmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
; X64-SSE4-NEXT: pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
; X64-SSE4-NEXT: pmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero
; X64-SSE4-NEXT: pmuludq %xmm2, %xmm1
; X64-SSE4-NEXT: pmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero
; X64-SSE4-NEXT: pmuludq %xmm2, %xmm0
; X64-SSE4-NEXT: retq
;
; X64-XOP-LABEL: mul_v4i64_zext_cross_bb:
; X64-XOP: # %bb.0:
; X64-XOP-NEXT: vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
; X64-XOP-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
; X64-XOP-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X64-XOP-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
; X64-XOP-NEXT: vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero
; X64-XOP-NEXT: vextractf128 $1, %ymm0, %xmm3
; X64-XOP-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
; X64-XOP-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; X64-XOP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; X64-XOP-NEXT: retq
;
; X64-AVX2-LABEL: mul_v4i64_zext_cross_bb:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X64-AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X64-AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: retq
;
; X64-AVX512DQ-LABEL: mul_v4i64_zext_cross_bb:
; X64-AVX512DQ: # %bb.0:
; X64-AVX512DQ-NEXT: vpmovzxdq {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X64-AVX512DQ-NEXT: vpmovzxdq {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X64-AVX512DQ-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; X64-AVX512DQ-NEXT: retq
%a = load <4 x i32>, ptr %in
%b = zext <4 x i32> %a to <4 x i64>
br label %foo

foo:
%c = load <4 x i32>, ptr %y
%d = zext <4 x i32> %c to <4 x i64>
%e = mul <4 x i64> %b, %d
ret <4 x i64> %e
}

;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; X64-SSE4-FAST: {{.*}}
; X64-SSE4-SLOW: {{.*}}