1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
3 ; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
4 ; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,AVX,AVX2,AVX2-SLOW
5 ; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX2,AVX2-FAST
6 ; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
7 ; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512BW
8 ; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512DQ
14 define <4 x i32> @trunc_add_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
15 ; SSE-LABEL: trunc_add_v4i64_v4i32:
17 ; SSE-NEXT: paddq %xmm3, %xmm1
18 ; SSE-NEXT: paddq %xmm2, %xmm0
19 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
22 ; AVX1-LABEL: trunc_add_v4i64_v4i32:
24 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
25 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
26 ; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2
27 ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
28 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
29 ; AVX1-NEXT: vzeroupper
32 ; AVX2-SLOW-LABEL: trunc_add_v4i64_v4i32:
34 ; AVX2-SLOW-NEXT: vpaddq %ymm1, %ymm0, %ymm0
35 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
36 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
37 ; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
38 ; AVX2-SLOW-NEXT: vzeroupper
39 ; AVX2-SLOW-NEXT: retq
41 ; AVX2-FAST-LABEL: trunc_add_v4i64_v4i32:
43 ; AVX2-FAST-NEXT: vpaddq %ymm1, %ymm0, %ymm0
44 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
45 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
46 ; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
47 ; AVX2-FAST-NEXT: vzeroupper
48 ; AVX2-FAST-NEXT: retq
50 ; AVX512-LABEL: trunc_add_v4i64_v4i32:
52 ; AVX512-NEXT: vpaddq %ymm1, %ymm0, %ymm0
53 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
54 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
55 ; AVX512-NEXT: vzeroupper
57 %1 = add <4 x i64> %a0, %a1
58 %2 = trunc <4 x i64> %1 to <4 x i32>
59 ret <4 x i32> %2
60 }
62 define <8 x i16> @trunc_add_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
63 ; SSE-LABEL: trunc_add_v8i64_v8i16:
65 ; SSE-NEXT: paddq %xmm6, %xmm2
66 ; SSE-NEXT: paddq %xmm7, %xmm3
67 ; SSE-NEXT: paddq %xmm4, %xmm0
68 ; SSE-NEXT: paddq %xmm5, %xmm1
69 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
70 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
71 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
72 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
73 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
74 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
75 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
76 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
77 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
78 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
79 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
82 ; AVX1-LABEL: trunc_add_v8i64_v8i16:
84 ; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm4
85 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
86 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
87 ; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0
88 ; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm2
89 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
90 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
91 ; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1
92 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
93 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
94 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
95 ; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
96 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
97 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1,2,3],xmm4[4],xmm3[5,6,7]
98 ; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
99 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
100 ; AVX1-NEXT: vzeroupper
103 ; AVX2-SLOW-LABEL: trunc_add_v8i64_v8i16:
104 ; AVX2-SLOW: # %bb.0:
105 ; AVX2-SLOW-NEXT: vpaddq %ymm3, %ymm1, %ymm1
106 ; AVX2-SLOW-NEXT: vpaddq %ymm2, %ymm0, %ymm0
107 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
108 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
109 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
110 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
111 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
112 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
113 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
114 ; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
115 ; AVX2-SLOW-NEXT: vzeroupper
116 ; AVX2-SLOW-NEXT: retq
118 ; AVX2-FAST-LABEL: trunc_add_v8i64_v8i16:
119 ; AVX2-FAST: # %bb.0:
120 ; AVX2-FAST-NEXT: vpaddq %ymm3, %ymm1, %ymm1
121 ; AVX2-FAST-NEXT: vpaddq %ymm2, %ymm0, %ymm0
122 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
123 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
124 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
125 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
126 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
127 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
128 ; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
129 ; AVX2-FAST-NEXT: vzeroupper
130 ; AVX2-FAST-NEXT: retq
132 ; AVX512-LABEL: trunc_add_v8i64_v8i16:
134 ; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0
135 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0
136 ; AVX512-NEXT: vzeroupper
138 %1 = add <8 x i64> %a0, %a1
139 %2 = trunc <8 x i64> %1 to <8 x i16>
140 ret <8 x i16> %2
141 }
143 define <8 x i16> @trunc_add_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
144 ; SSE-LABEL: trunc_add_v8i32_v8i16:
146 ; SSE-NEXT: paddd %xmm2, %xmm0
147 ; SSE-NEXT: paddd %xmm3, %xmm1
148 ; SSE-NEXT: pslld $16, %xmm1
149 ; SSE-NEXT: psrad $16, %xmm1
150 ; SSE-NEXT: pslld $16, %xmm0
151 ; SSE-NEXT: psrad $16, %xmm0
152 ; SSE-NEXT: packssdw %xmm1, %xmm0
155 ; AVX1-LABEL: trunc_add_v8i32_v8i16:
157 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm2
158 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
159 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
160 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
161 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
162 ; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
163 ; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1
164 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
165 ; AVX1-NEXT: vzeroupper
168 ; AVX2-LABEL: trunc_add_v8i32_v8i16:
170 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
171 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
172 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
173 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
174 ; AVX2-NEXT: vzeroupper
177 ; AVX512-LABEL: trunc_add_v8i32_v8i16:
179 ; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0
180 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
181 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
182 ; AVX512-NEXT: vzeroupper
184 %1 = add <8 x i32> %a0, %a1
185 %2 = trunc <8 x i32> %1 to <8 x i16>
186 ret <8 x i16> %2
187 }
189 define <16 x i8> @trunc_add_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
190 ; SSE-LABEL: trunc_add_v16i64_v16i8:
192 ; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm0
193 ; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm1
194 ; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm2
195 ; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm3
196 ; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm4
197 ; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm5
198 ; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm6
199 ; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm7
200 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
201 ; SSE-NEXT: pand %xmm8, %xmm7
202 ; SSE-NEXT: pand %xmm8, %xmm6
203 ; SSE-NEXT: packuswb %xmm7, %xmm6
204 ; SSE-NEXT: pand %xmm8, %xmm5
205 ; SSE-NEXT: pand %xmm8, %xmm4
206 ; SSE-NEXT: packuswb %xmm5, %xmm4
207 ; SSE-NEXT: packuswb %xmm6, %xmm4
208 ; SSE-NEXT: pand %xmm8, %xmm3
209 ; SSE-NEXT: pand %xmm8, %xmm2
210 ; SSE-NEXT: packuswb %xmm3, %xmm2
211 ; SSE-NEXT: pand %xmm8, %xmm1
212 ; SSE-NEXT: pand %xmm8, %xmm0
213 ; SSE-NEXT: packuswb %xmm1, %xmm0
214 ; SSE-NEXT: packuswb %xmm2, %xmm0
215 ; SSE-NEXT: packuswb %xmm4, %xmm0
218 ; AVX1-LABEL: trunc_add_v16i64_v16i8:
220 ; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm8
221 ; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm4
222 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
223 ; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm0
224 ; AVX1-NEXT: vpaddq %xmm5, %xmm1, %xmm4
225 ; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm5
226 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
227 ; AVX1-NEXT: vpaddq %xmm5, %xmm1, %xmm1
228 ; AVX1-NEXT: vpaddq %xmm6, %xmm2, %xmm5
229 ; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm6
230 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
231 ; AVX1-NEXT: vpaddq %xmm6, %xmm2, %xmm2
232 ; AVX1-NEXT: vpaddq %xmm7, %xmm3, %xmm6
233 ; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm7
234 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
235 ; AVX1-NEXT: vpaddq %xmm7, %xmm3, %xmm3
236 ; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [1.2598673968951787E-321,1.2598673968951787E-321]
237 ; AVX1-NEXT: # xmm7 = mem[0,0]
238 ; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
239 ; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6
240 ; AVX1-NEXT: vpackusdw %xmm3, %xmm6, %xmm3
241 ; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2
242 ; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5
243 ; AVX1-NEXT: vpackusdw %xmm2, %xmm5, %xmm2
244 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
245 ; AVX1-NEXT: vpand %xmm7, %xmm1, %xmm1
246 ; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm3
247 ; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1
248 ; AVX1-NEXT: vpand %xmm7, %xmm0, %xmm0
249 ; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm3
250 ; AVX1-NEXT: vpackusdw %xmm0, %xmm3, %xmm0
251 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
252 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
253 ; AVX1-NEXT: vzeroupper
256 ; AVX2-SLOW-LABEL: trunc_add_v16i64_v16i8:
257 ; AVX2-SLOW: # %bb.0:
258 ; AVX2-SLOW-NEXT: vpaddq %ymm5, %ymm1, %ymm1
259 ; AVX2-SLOW-NEXT: vpaddq %ymm4, %ymm0, %ymm0
260 ; AVX2-SLOW-NEXT: vpaddq %ymm7, %ymm3, %ymm3
261 ; AVX2-SLOW-NEXT: vpaddq %ymm6, %ymm2, %ymm2
262 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
263 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
264 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
265 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
266 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
267 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
268 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
269 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
270 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
271 ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
272 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
273 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
274 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
275 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
276 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
277 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
278 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
279 ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
280 ; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
281 ; AVX2-SLOW-NEXT: vzeroupper
282 ; AVX2-SLOW-NEXT: retq
284 ; AVX2-FAST-LABEL: trunc_add_v16i64_v16i8:
285 ; AVX2-FAST: # %bb.0:
286 ; AVX2-FAST-NEXT: vpaddq %ymm5, %ymm1, %ymm1
287 ; AVX2-FAST-NEXT: vpaddq %ymm4, %ymm0, %ymm0
288 ; AVX2-FAST-NEXT: vpaddq %ymm7, %ymm3, %ymm3
289 ; AVX2-FAST-NEXT: vpaddq %ymm6, %ymm2, %ymm2
290 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
291 ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2
292 ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3
293 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
294 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
295 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2
296 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
297 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
298 ; AVX2-FAST-NEXT: vpand %xmm5, %xmm2, %xmm2
299 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0
300 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1
301 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
302 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0
303 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
304 ; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0
305 ; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
306 ; AVX2-FAST-NEXT: vzeroupper
307 ; AVX2-FAST-NEXT: retq
309 ; AVX512-LABEL: trunc_add_v16i64_v16i8:
311 ; AVX512-NEXT: vpaddq %zmm3, %zmm1, %zmm1
312 ; AVX512-NEXT: vpaddq %zmm2, %zmm0, %zmm0
313 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
314 ; AVX512-NEXT: vpmovqd %zmm1, %ymm1
315 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
316 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
317 ; AVX512-NEXT: vzeroupper
319 %1 = add <16 x i64> %a0, %a1
320 %2 = trunc <16 x i64> %1 to <16 x i8>
321 ret <16 x i8> %2
322 }
324 define <16 x i8> @trunc_add_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
325 ; SSE-LABEL: trunc_add_v16i32_v16i8:
327 ; SSE-NEXT: paddd %xmm4, %xmm0
328 ; SSE-NEXT: paddd %xmm5, %xmm1
329 ; SSE-NEXT: paddd %xmm6, %xmm2
330 ; SSE-NEXT: paddd %xmm7, %xmm3
331 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
332 ; SSE-NEXT: pand %xmm4, %xmm3
333 ; SSE-NEXT: pand %xmm4, %xmm2
334 ; SSE-NEXT: packuswb %xmm3, %xmm2
335 ; SSE-NEXT: pand %xmm4, %xmm1
336 ; SSE-NEXT: pand %xmm4, %xmm0
337 ; SSE-NEXT: packuswb %xmm1, %xmm0
338 ; SSE-NEXT: packuswb %xmm2, %xmm0
341 ; AVX1-LABEL: trunc_add_v16i32_v16i8:
343 ; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm4
344 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
345 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
346 ; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
347 ; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm2
348 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
349 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
350 ; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1
351 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
352 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
353 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
354 ; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
355 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
356 ; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm2
357 ; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
358 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
359 ; AVX1-NEXT: vzeroupper
362 ; AVX2-LABEL: trunc_add_v16i32_v16i8:
364 ; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0
365 ; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1
366 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
367 ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
368 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
369 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
370 ; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1
371 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
372 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
373 ; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0
374 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
375 ; AVX2-NEXT: vzeroupper
378 ; AVX512-LABEL: trunc_add_v16i32_v16i8:
380 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
381 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
382 ; AVX512-NEXT: vzeroupper
384 %1 = add <16 x i32> %a0, %a1
385 %2 = trunc <16 x i32> %1 to <16 x i8>
386 ret <16 x i8> %2
387 }
389 define <16 x i8> @trunc_add_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
390 ; SSE-LABEL: trunc_add_v16i16_v16i8:
392 ; SSE-NEXT: paddw %xmm2, %xmm0
393 ; SSE-NEXT: paddw %xmm3, %xmm1
394 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
395 ; SSE-NEXT: pand %xmm2, %xmm1
396 ; SSE-NEXT: pand %xmm2, %xmm0
397 ; SSE-NEXT: packuswb %xmm1, %xmm0
400 ; AVX1-LABEL: trunc_add_v16i16_v16i8:
402 ; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm2
403 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
404 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
405 ; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
406 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
407 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
408 ; AVX1-NEXT: vpand %xmm1, %xmm2, %xmm1
409 ; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0
410 ; AVX1-NEXT: vzeroupper
413 ; AVX2-LABEL: trunc_add_v16i16_v16i8:
415 ; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
416 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
417 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
418 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
419 ; AVX2-NEXT: vzeroupper
422 ; AVX512F-LABEL: trunc_add_v16i16_v16i8:
424 ; AVX512F-NEXT: vpaddw %ymm1, %ymm0, %ymm0
425 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
426 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
427 ; AVX512F-NEXT: vzeroupper
430 ; AVX512BW-LABEL: trunc_add_v16i16_v16i8:
432 ; AVX512BW-NEXT: vpaddw %ymm1, %ymm0, %ymm0
433 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
434 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
435 ; AVX512BW-NEXT: vzeroupper
436 ; AVX512BW-NEXT: retq
438 ; AVX512DQ-LABEL: trunc_add_v16i16_v16i8:
440 ; AVX512DQ-NEXT: vpaddw %ymm1, %ymm0, %ymm0
441 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
442 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
443 ; AVX512DQ-NEXT: vzeroupper
444 ; AVX512DQ-NEXT: retq
445 %1 = add <16 x i16> %a0, %a1
446 %2 = trunc <16 x i16> %1 to <16 x i8>
447 ret <16 x i8> %2
448 }
450 define <8 x i16> @trunc_add_v8i32_v8i16_sext_8i8(<16 x i8> %a0, <8 x i32> %a1) {
451 ; SSE-LABEL: trunc_add_v8i32_v8i16_sext_8i8:
453 ; SSE-NEXT: pslld $16, %xmm2
454 ; SSE-NEXT: psrad $16, %xmm2
455 ; SSE-NEXT: pslld $16, %xmm1
456 ; SSE-NEXT: psrad $16, %xmm1
457 ; SSE-NEXT: packssdw %xmm2, %xmm1
458 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
459 ; SSE-NEXT: psraw $8, %xmm0
460 ; SSE-NEXT: paddw %xmm1, %xmm0
463 ; AVX1-LABEL: trunc_add_v8i32_v8i16_sext_8i8:
465 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
466 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
467 ; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
468 ; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
469 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
470 ; AVX1-NEXT: vpmovsxbw %xmm0, %xmm0
471 ; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
472 ; AVX1-NEXT: vzeroupper
475 ; AVX2-LABEL: trunc_add_v8i32_v8i16_sext_8i8:
477 ; AVX2-NEXT: vpmovsxbw %xmm0, %xmm0
478 ; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
479 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
480 ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
481 ; AVX2-NEXT: vzeroupper
484 ; AVX512-LABEL: trunc_add_v8i32_v8i16_sext_8i8:
486 ; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
487 ; AVX512-NEXT: vpmovdw %zmm1, %ymm1
488 ; AVX512-NEXT: vpmovsxbw %xmm0, %xmm0
489 ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
490 ; AVX512-NEXT: vzeroupper
492 %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
493 %2 = sext <8 x i8> %1 to <8 x i32>
494 %3 = add <8 x i32> %2, %a1
495 %4 = trunc <8 x i32> %3 to <8 x i16>
496 ret <8 x i16> %4
497 }
503 define <4 x i32> @trunc_add_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
504 ; SSE-LABEL: trunc_add_const_v4i64_v4i32:
506 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
507 ; SSE-NEXT: paddd {{.*}}(%rip), %xmm0
510 ; AVX1-LABEL: trunc_add_const_v4i64_v4i32:
512 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
513 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
514 ; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
515 ; AVX1-NEXT: vzeroupper
518 ; AVX2-SLOW-LABEL: trunc_add_const_v4i64_v4i32:
519 ; AVX2-SLOW: # %bb.0:
520 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
521 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
522 ; AVX2-SLOW-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
523 ; AVX2-SLOW-NEXT: vzeroupper
524 ; AVX2-SLOW-NEXT: retq
526 ; AVX2-FAST-LABEL: trunc_add_const_v4i64_v4i32:
527 ; AVX2-FAST: # %bb.0:
528 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
529 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
530 ; AVX2-FAST-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
531 ; AVX2-FAST-NEXT: vzeroupper
532 ; AVX2-FAST-NEXT: retq
534 ; AVX512-LABEL: trunc_add_const_v4i64_v4i32:
536 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
537 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
538 ; AVX512-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
539 ; AVX512-NEXT: vzeroupper
541 %1 = add <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
542 %2 = trunc <4 x i64> %1 to <4 x i32>
543 ret <4 x i32> %2
544 }
546 define <8 x i16> @trunc_add_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
547 ; SSE-LABEL: trunc_add_const_v8i64_v8i16:
549 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
550 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
551 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
552 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
553 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
554 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
555 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
556 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
557 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
558 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
559 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
560 ; SSE-NEXT: paddw {{.*}}(%rip), %xmm0
563 ; AVX1-LABEL: trunc_add_const_v8i64_v8i16:
565 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
566 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
567 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
568 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
569 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
570 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
571 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
572 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
573 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
574 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
575 ; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
576 ; AVX1-NEXT: vzeroupper
579 ; AVX2-SLOW-LABEL: trunc_add_const_v8i64_v8i16:
580 ; AVX2-SLOW: # %bb.0:
581 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
582 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
583 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
584 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
585 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
586 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
587 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
588 ; AVX2-SLOW-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
589 ; AVX2-SLOW-NEXT: vzeroupper
590 ; AVX2-SLOW-NEXT: retq
592 ; AVX2-FAST-LABEL: trunc_add_const_v8i64_v8i16:
593 ; AVX2-FAST: # %bb.0:
594 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
595 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
596 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
597 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
598 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
599 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
600 ; AVX2-FAST-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
601 ; AVX2-FAST-NEXT: vzeroupper
602 ; AVX2-FAST-NEXT: retq
604 ; AVX512-LABEL: trunc_add_const_v8i64_v8i16:
606 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0
607 ; AVX512-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
608 ; AVX512-NEXT: vzeroupper
610 %1 = add <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
611 %2 = trunc <8 x i64> %1 to <8 x i16>
612 ret <8 x i16> %2
613 }
615 define <8 x i16> @trunc_add_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
616 ; SSE-LABEL: trunc_add_const_v8i32_v8i16:
618 ; SSE-NEXT: pslld $16, %xmm1
619 ; SSE-NEXT: psrad $16, %xmm1
620 ; SSE-NEXT: pslld $16, %xmm0
621 ; SSE-NEXT: psrad $16, %xmm0
622 ; SSE-NEXT: packssdw %xmm1, %xmm0
623 ; SSE-NEXT: paddw {{.*}}(%rip), %xmm0
626 ; AVX1-LABEL: trunc_add_const_v8i32_v8i16:
628 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
629 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
630 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
631 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
632 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
633 ; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
634 ; AVX1-NEXT: vzeroupper
637 ; AVX2-LABEL: trunc_add_const_v8i32_v8i16:
639 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
640 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
641 ; AVX2-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
642 ; AVX2-NEXT: vzeroupper
645 ; AVX512-LABEL: trunc_add_const_v8i32_v8i16:
647 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
648 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
649 ; AVX512-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
650 ; AVX512-NEXT: vzeroupper
652 %1 = add <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
653 %2 = trunc <8 x i32> %1 to <8 x i16>
654 ret <8 x i16> %2
655 }
657 define <16 x i8> @trunc_add_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
658 ; SSE-LABEL: trunc_add_const_v16i64_v16i8:
660 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
661 ; SSE-NEXT: pand %xmm8, %xmm7
662 ; SSE-NEXT: pand %xmm8, %xmm6
663 ; SSE-NEXT: packuswb %xmm7, %xmm6
664 ; SSE-NEXT: pand %xmm8, %xmm5
665 ; SSE-NEXT: pand %xmm8, %xmm4
666 ; SSE-NEXT: packuswb %xmm5, %xmm4
667 ; SSE-NEXT: packuswb %xmm6, %xmm4
668 ; SSE-NEXT: pand %xmm8, %xmm3
669 ; SSE-NEXT: pand %xmm8, %xmm2
670 ; SSE-NEXT: packuswb %xmm3, %xmm2
671 ; SSE-NEXT: pand %xmm8, %xmm1
672 ; SSE-NEXT: pand %xmm8, %xmm0
673 ; SSE-NEXT: packuswb %xmm1, %xmm0
674 ; SSE-NEXT: packuswb %xmm2, %xmm0
675 ; SSE-NEXT: packuswb %xmm4, %xmm0
676 ; SSE-NEXT: paddb {{.*}}(%rip), %xmm0
679 ; AVX1-LABEL: trunc_add_const_v16i64_v16i8:
681 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
682 ; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = [1.2598673968951787E-321,1.2598673968951787E-321]
683 ; AVX1-NEXT: # xmm5 = mem[0,0]
684 ; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4
685 ; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
686 ; AVX1-NEXT: vpackusdw %xmm4, %xmm3, %xmm3
687 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
688 ; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4
689 ; AVX1-NEXT: vandps %xmm5, %xmm2, %xmm2
690 ; AVX1-NEXT: vpackusdw %xmm4, %xmm2, %xmm2
691 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
692 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
693 ; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
694 ; AVX1-NEXT: vandps %xmm5, %xmm1, %xmm1
695 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
696 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
697 ; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
698 ; AVX1-NEXT: vandps %xmm5, %xmm0, %xmm0
699 ; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
700 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
701 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
702 ; AVX1-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
703 ; AVX1-NEXT: vzeroupper
706 ; AVX2-SLOW-LABEL: trunc_add_const_v16i64_v16i8:
707 ; AVX2-SLOW: # %bb.0:
708 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
709 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
710 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
711 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
712 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
713 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
714 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
715 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
716 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
717 ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
718 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
719 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
720 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
721 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
722 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
723 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
724 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
725 ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
726 ; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
727 ; AVX2-SLOW-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
728 ; AVX2-SLOW-NEXT: vzeroupper
729 ; AVX2-SLOW-NEXT: retq
731 ; AVX2-FAST-LABEL: trunc_add_const_v16i64_v16i8:
732 ; AVX2-FAST: # %bb.0:
733 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
734 ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2
735 ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3
736 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
737 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
738 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2
739 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
740 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
741 ; AVX2-FAST-NEXT: vpand %xmm5, %xmm2, %xmm2
742 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0
743 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1
744 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
745 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0
746 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
747 ; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0
748 ; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
749 ; AVX2-FAST-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
750 ; AVX2-FAST-NEXT: vzeroupper
751 ; AVX2-FAST-NEXT: retq
753 ; AVX512-LABEL: trunc_add_const_v16i64_v16i8:
755 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
756 ; AVX512-NEXT: vpmovqd %zmm1, %ymm1
757 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
758 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
759 ; AVX512-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
760 ; AVX512-NEXT: vzeroupper
762 %1 = add <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
763 %2 = trunc <16 x i64> %1 to <16 x i8>
764 ret <16 x i8> %2
765 }
767 define <16 x i8> @trunc_add_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
768 ; SSE-LABEL: trunc_add_const_v16i32_v16i8:
770 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
771 ; SSE-NEXT: pand %xmm4, %xmm3
772 ; SSE-NEXT: pand %xmm4, %xmm2
773 ; SSE-NEXT: packuswb %xmm3, %xmm2
774 ; SSE-NEXT: pand %xmm4, %xmm1
775 ; SSE-NEXT: pand %xmm4, %xmm0
776 ; SSE-NEXT: packuswb %xmm1, %xmm0
777 ; SSE-NEXT: packuswb %xmm2, %xmm0
778 ; SSE-NEXT: paddb {{.*}}(%rip), %xmm0
781 ; AVX1-LABEL: trunc_add_const_v16i32_v16i8:
783 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
784 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
785 ; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2
786 ; AVX1-NEXT: vandps %xmm3, %xmm1, %xmm1
787 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
788 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
789 ; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2
790 ; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm0
791 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
792 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
793 ; AVX1-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
794 ; AVX1-NEXT: vzeroupper
797 ; AVX2-LABEL: trunc_add_const_v16i32_v16i8:
799 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
800 ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
801 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
802 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
803 ; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1
804 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
805 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
806 ; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0
807 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
808 ; AVX2-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
809 ; AVX2-NEXT: vzeroupper
812 ; AVX512-LABEL: trunc_add_const_v16i32_v16i8:
814 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
815 ; AVX512-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
816 ; AVX512-NEXT: vzeroupper
818 %1 = add <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
819 %2 = trunc <16 x i32> %1 to <16 x i8>
820 ret <16 x i8> %2
821 }
823 define <16 x i8> @trunc_add_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
824 ; SSE-LABEL: trunc_add_const_v16i16_v16i8:
826 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
827 ; SSE-NEXT: pand %xmm2, %xmm1
828 ; SSE-NEXT: pand %xmm2, %xmm0
829 ; SSE-NEXT: packuswb %xmm1, %xmm0
830 ; SSE-NEXT: paddb {{.*}}(%rip), %xmm0
833 ; AVX1-LABEL: trunc_add_const_v16i16_v16i8:
835 ; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
836 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
837 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
838 ; AVX1-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
839 ; AVX1-NEXT: vzeroupper
842 ; AVX2-LABEL: trunc_add_const_v16i16_v16i8:
844 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
845 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
846 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
847 ; AVX2-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
848 ; AVX2-NEXT: vzeroupper
851 ; AVX512F-LABEL: trunc_add_const_v16i16_v16i8:
853 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
854 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
855 ; AVX512F-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
856 ; AVX512F-NEXT: vzeroupper
859 ; AVX512BW-LABEL: trunc_add_const_v16i16_v16i8:
861 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
862 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
863 ; AVX512BW-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
864 ; AVX512BW-NEXT: vzeroupper
865 ; AVX512BW-NEXT: retq
867 ; AVX512DQ-LABEL: trunc_add_const_v16i16_v16i8:
869 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
870 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
871 ; AVX512DQ-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
872 ; AVX512DQ-NEXT: vzeroupper
873 ; AVX512DQ-NEXT: retq
874 %1 = add <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
875 %2 = trunc <16 x i16> %1 to <16 x i8>
876 ret <16 x i8> %2
877 }
883 define <4 x i32> @trunc_sub_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
884 ; SSE-LABEL: trunc_sub_v4i64_v4i32:
886 ; SSE-NEXT: psubq %xmm3, %xmm1
887 ; SSE-NEXT: psubq %xmm2, %xmm0
888 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
891 ; AVX1-LABEL: trunc_sub_v4i64_v4i32:
893 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
894 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
895 ; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2
896 ; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0
897 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
898 ; AVX1-NEXT: vzeroupper
901 ; AVX2-SLOW-LABEL: trunc_sub_v4i64_v4i32:
902 ; AVX2-SLOW: # %bb.0:
903 ; AVX2-SLOW-NEXT: vpsubq %ymm1, %ymm0, %ymm0
904 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
905 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
906 ; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
907 ; AVX2-SLOW-NEXT: vzeroupper
908 ; AVX2-SLOW-NEXT: retq
910 ; AVX2-FAST-LABEL: trunc_sub_v4i64_v4i32:
911 ; AVX2-FAST: # %bb.0:
912 ; AVX2-FAST-NEXT: vpsubq %ymm1, %ymm0, %ymm0
913 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
914 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
915 ; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
916 ; AVX2-FAST-NEXT: vzeroupper
917 ; AVX2-FAST-NEXT: retq
919 ; AVX512-LABEL: trunc_sub_v4i64_v4i32:
921 ; AVX512-NEXT: vpsubq %ymm1, %ymm0, %ymm0
922 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
923 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
924 ; AVX512-NEXT: vzeroupper
926 %1 = sub <4 x i64> %a0, %a1
927 %2 = trunc <4 x i64> %1 to <4 x i32>
928 ret <4 x i32> %2
929 }
931 define <8 x i16> @trunc_sub_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
932 ; SSE-LABEL: trunc_sub_v8i64_v8i16:
934 ; SSE-NEXT: psubq %xmm6, %xmm2
935 ; SSE-NEXT: psubq %xmm7, %xmm3
936 ; SSE-NEXT: psubq %xmm4, %xmm0
937 ; SSE-NEXT: psubq %xmm5, %xmm1
938 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
939 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
940 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
941 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
942 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
943 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
944 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
945 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
946 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
947 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
948 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
951 ; AVX1-LABEL: trunc_sub_v8i64_v8i16:
953 ; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm4
954 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
955 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
956 ; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0
957 ; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm2
958 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
959 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
960 ; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm1
961 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
962 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
963 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
964 ; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
965 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
966 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1,2,3],xmm4[4],xmm3[5,6,7]
967 ; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
968 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
969 ; AVX1-NEXT: vzeroupper
972 ; AVX2-SLOW-LABEL: trunc_sub_v8i64_v8i16:
973 ; AVX2-SLOW: # %bb.0:
974 ; AVX2-SLOW-NEXT: vpsubq %ymm3, %ymm1, %ymm1
975 ; AVX2-SLOW-NEXT: vpsubq %ymm2, %ymm0, %ymm0
976 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
977 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
978 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
979 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
980 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
981 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
982 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
983 ; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
984 ; AVX2-SLOW-NEXT: vzeroupper
985 ; AVX2-SLOW-NEXT: retq
987 ; AVX2-FAST-LABEL: trunc_sub_v8i64_v8i16:
988 ; AVX2-FAST: # %bb.0:
989 ; AVX2-FAST-NEXT: vpsubq %ymm3, %ymm1, %ymm1
990 ; AVX2-FAST-NEXT: vpsubq %ymm2, %ymm0, %ymm0
991 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
992 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
993 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
994 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
995 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
996 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
997 ; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
998 ; AVX2-FAST-NEXT: vzeroupper
999 ; AVX2-FAST-NEXT: retq
1001 ; AVX512-LABEL: trunc_sub_v8i64_v8i16:
1003 ; AVX512-NEXT: vpsubq %zmm1, %zmm0, %zmm0
1004 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0
1005 ; AVX512-NEXT: vzeroupper
1007 %1 = sub <8 x i64> %a0, %a1
1008 %2 = trunc <8 x i64> %1 to <8 x i16>
1009 ret <8 x i16> %2
1010 }
1012 define <8 x i16> @trunc_sub_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
1013 ; SSE-LABEL: trunc_sub_v8i32_v8i16:
1015 ; SSE-NEXT: psubd %xmm2, %xmm0
1016 ; SSE-NEXT: psubd %xmm3, %xmm1
1017 ; SSE-NEXT: pslld $16, %xmm1
1018 ; SSE-NEXT: psrad $16, %xmm1
1019 ; SSE-NEXT: pslld $16, %xmm0
1020 ; SSE-NEXT: psrad $16, %xmm0
1021 ; SSE-NEXT: packssdw %xmm1, %xmm0
1024 ; AVX1-LABEL: trunc_sub_v8i32_v8i16:
1026 ; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm2
1027 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
1028 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1029 ; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
1030 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
1031 ; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
1032 ; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1
1033 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1034 ; AVX1-NEXT: vzeroupper
1037 ; AVX2-LABEL: trunc_sub_v8i32_v8i16:
1039 ; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0
1040 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
1041 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1042 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1043 ; AVX2-NEXT: vzeroupper
1046 ; AVX512-LABEL: trunc_sub_v8i32_v8i16:
1048 ; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm0
1049 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
1050 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1051 ; AVX512-NEXT: vzeroupper
1053 %1 = sub <8 x i32> %a0, %a1
1054 %2 = trunc <8 x i32> %1 to <8 x i16>
1055 ret <8 x i16> %2
1056 }
1058 define <16 x i8> @trunc_sub_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
1059 ; SSE-LABEL: trunc_sub_v16i64_v16i8:
1061 ; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm0
1062 ; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm1
1063 ; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm2
1064 ; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm3
1065 ; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm4
1066 ; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm5
1067 ; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm6
1068 ; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm7
1069 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
1070 ; SSE-NEXT: pand %xmm8, %xmm7
1071 ; SSE-NEXT: pand %xmm8, %xmm6
1072 ; SSE-NEXT: packuswb %xmm7, %xmm6
1073 ; SSE-NEXT: pand %xmm8, %xmm5
1074 ; SSE-NEXT: pand %xmm8, %xmm4
1075 ; SSE-NEXT: packuswb %xmm5, %xmm4
1076 ; SSE-NEXT: packuswb %xmm6, %xmm4
1077 ; SSE-NEXT: pand %xmm8, %xmm3
1078 ; SSE-NEXT: pand %xmm8, %xmm2
1079 ; SSE-NEXT: packuswb %xmm3, %xmm2
1080 ; SSE-NEXT: pand %xmm8, %xmm1
1081 ; SSE-NEXT: pand %xmm8, %xmm0
1082 ; SSE-NEXT: packuswb %xmm1, %xmm0
1083 ; SSE-NEXT: packuswb %xmm2, %xmm0
1084 ; SSE-NEXT: packuswb %xmm4, %xmm0
1087 ; AVX1-LABEL: trunc_sub_v16i64_v16i8:
1089 ; AVX1-NEXT: vpsubq %xmm4, %xmm0, %xmm8
1090 ; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm4
1091 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1092 ; AVX1-NEXT: vpsubq %xmm4, %xmm0, %xmm0
1093 ; AVX1-NEXT: vpsubq %xmm5, %xmm1, %xmm4
1094 ; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm5
1095 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
1096 ; AVX1-NEXT: vpsubq %xmm5, %xmm1, %xmm1
1097 ; AVX1-NEXT: vpsubq %xmm6, %xmm2, %xmm5
1098 ; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm6
1099 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
1100 ; AVX1-NEXT: vpsubq %xmm6, %xmm2, %xmm2
1101 ; AVX1-NEXT: vpsubq %xmm7, %xmm3, %xmm6
1102 ; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm7
1103 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
1104 ; AVX1-NEXT: vpsubq %xmm7, %xmm3, %xmm3
1105 ; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [1.2598673968951787E-321,1.2598673968951787E-321]
1106 ; AVX1-NEXT: # xmm7 = mem[0,0]
1107 ; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
1108 ; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6
1109 ; AVX1-NEXT: vpackusdw %xmm3, %xmm6, %xmm3
1110 ; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2
1111 ; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5
1112 ; AVX1-NEXT: vpackusdw %xmm2, %xmm5, %xmm2
1113 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
1114 ; AVX1-NEXT: vpand %xmm7, %xmm1, %xmm1
1115 ; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm3
1116 ; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1
1117 ; AVX1-NEXT: vpand %xmm7, %xmm0, %xmm0
1118 ; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm3
1119 ; AVX1-NEXT: vpackusdw %xmm0, %xmm3, %xmm0
1120 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
1121 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
1122 ; AVX1-NEXT: vzeroupper
1125 ; AVX2-SLOW-LABEL: trunc_sub_v16i64_v16i8:
1126 ; AVX2-SLOW: # %bb.0:
1127 ; AVX2-SLOW-NEXT: vpsubq %ymm5, %ymm1, %ymm1
1128 ; AVX2-SLOW-NEXT: vpsubq %ymm4, %ymm0, %ymm0
1129 ; AVX2-SLOW-NEXT: vpsubq %ymm7, %ymm3, %ymm3
1130 ; AVX2-SLOW-NEXT: vpsubq %ymm6, %ymm2, %ymm2
1131 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
1132 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
1133 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
1134 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
1135 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
1136 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
1137 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
1138 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
1139 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
1140 ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
1141 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
1142 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1143 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
1144 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
1145 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
1146 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
1147 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1148 ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
1149 ; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
1150 ; AVX2-SLOW-NEXT: vzeroupper
1151 ; AVX2-SLOW-NEXT: retq
1153 ; AVX2-FAST-LABEL: trunc_sub_v16i64_v16i8:
1154 ; AVX2-FAST: # %bb.0:
1155 ; AVX2-FAST-NEXT: vpsubq %ymm5, %ymm1, %ymm1
1156 ; AVX2-FAST-NEXT: vpsubq %ymm4, %ymm0, %ymm0
1157 ; AVX2-FAST-NEXT: vpsubq %ymm7, %ymm3, %ymm3
1158 ; AVX2-FAST-NEXT: vpsubq %ymm6, %ymm2, %ymm2
1159 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
1160 ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2
1161 ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3
1162 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
1163 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
1164 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2
1165 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
1166 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
1167 ; AVX2-FAST-NEXT: vpand %xmm5, %xmm2, %xmm2
1168 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0
1169 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1
1170 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
1171 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0
1172 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1173 ; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0
1174 ; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
1175 ; AVX2-FAST-NEXT: vzeroupper
1176 ; AVX2-FAST-NEXT: retq
1178 ; AVX512-LABEL: trunc_sub_v16i64_v16i8:
1180 ; AVX512-NEXT: vpsubq %zmm3, %zmm1, %zmm1
1181 ; AVX512-NEXT: vpsubq %zmm2, %zmm0, %zmm0
1182 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
1183 ; AVX512-NEXT: vpmovqd %zmm1, %ymm1
1184 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
1185 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
1186 ; AVX512-NEXT: vzeroupper
1188 %1 = sub <16 x i64> %a0, %a1
1189 %2 = trunc <16 x i64> %1 to <16 x i8>
1190 ret <16 x i8> %2
1191 }
1193 define <16 x i8> @trunc_sub_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
1194 ; SSE-LABEL: trunc_sub_v16i32_v16i8:
1196 ; SSE-NEXT: psubd %xmm4, %xmm0
1197 ; SSE-NEXT: psubd %xmm5, %xmm1
1198 ; SSE-NEXT: psubd %xmm6, %xmm2
1199 ; SSE-NEXT: psubd %xmm7, %xmm3
1200 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
1201 ; SSE-NEXT: pand %xmm4, %xmm3
1202 ; SSE-NEXT: pand %xmm4, %xmm2
1203 ; SSE-NEXT: packuswb %xmm3, %xmm2
1204 ; SSE-NEXT: pand %xmm4, %xmm1
1205 ; SSE-NEXT: pand %xmm4, %xmm0
1206 ; SSE-NEXT: packuswb %xmm1, %xmm0
1207 ; SSE-NEXT: packuswb %xmm2, %xmm0
1210 ; AVX1-LABEL: trunc_sub_v16i32_v16i8:
1212 ; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm4
1213 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
1214 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1215 ; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm0
1216 ; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm2
1217 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
1218 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
1219 ; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm1
1220 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
1221 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
1222 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
1223 ; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
1224 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
1225 ; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm2
1226 ; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
1227 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
1228 ; AVX1-NEXT: vzeroupper
1231 ; AVX2-LABEL: trunc_sub_v16i32_v16i8:
1233 ; AVX2-NEXT: vpsubd %ymm2, %ymm0, %ymm0
1234 ; AVX2-NEXT: vpsubd %ymm3, %ymm1, %ymm1
1235 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
1236 ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
1237 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
1238 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
1239 ; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1
1240 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
1241 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1242 ; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0
1243 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
1244 ; AVX2-NEXT: vzeroupper
1247 ; AVX512-LABEL: trunc_sub_v16i32_v16i8:
1249 ; AVX512-NEXT: vpsubd %zmm1, %zmm0, %zmm0
1250 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
1251 ; AVX512-NEXT: vzeroupper
1253 %1 = sub <16 x i32> %a0, %a1
1254 %2 = trunc <16 x i32> %1 to <16 x i8>
1255 ret <16 x i8> %2
1256 }
1258 define <16 x i8> @trunc_sub_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
1259 ; SSE-LABEL: trunc_sub_v16i16_v16i8:
1261 ; SSE-NEXT: psubw %xmm2, %xmm0
1262 ; SSE-NEXT: psubw %xmm3, %xmm1
1263 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
1264 ; SSE-NEXT: pand %xmm2, %xmm1
1265 ; SSE-NEXT: pand %xmm2, %xmm0
1266 ; SSE-NEXT: packuswb %xmm1, %xmm0
1269 ; AVX1-LABEL: trunc_sub_v16i16_v16i8:
1271 ; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm2
1272 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
1273 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1274 ; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm0
1275 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
1276 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
1277 ; AVX1-NEXT: vpand %xmm1, %xmm2, %xmm1
1278 ; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0
1279 ; AVX1-NEXT: vzeroupper
1282 ; AVX2-LABEL: trunc_sub_v16i16_v16i8:
1284 ; AVX2-NEXT: vpsubw %ymm1, %ymm0, %ymm0
1285 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
1286 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1287 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
1288 ; AVX2-NEXT: vzeroupper
1291 ; AVX512F-LABEL: trunc_sub_v16i16_v16i8:
1293 ; AVX512F-NEXT: vpsubw %ymm1, %ymm0, %ymm0
1294 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1295 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
1296 ; AVX512F-NEXT: vzeroupper
1297 ; AVX512F-NEXT: retq
1299 ; AVX512BW-LABEL: trunc_sub_v16i16_v16i8:
1300 ; AVX512BW: # %bb.0:
1301 ; AVX512BW-NEXT: vpsubw %ymm1, %ymm0, %ymm0
1302 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
1303 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1304 ; AVX512BW-NEXT: vzeroupper
1305 ; AVX512BW-NEXT: retq
1307 ; AVX512DQ-LABEL: trunc_sub_v16i16_v16i8:
1308 ; AVX512DQ: # %bb.0:
1309 ; AVX512DQ-NEXT: vpsubw %ymm1, %ymm0, %ymm0
1310 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1311 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
1312 ; AVX512DQ-NEXT: vzeroupper
1313 ; AVX512DQ-NEXT: retq
1314 %1 = sub <16 x i16> %a0, %a1
1315 %2 = trunc <16 x i16> %1 to <16 x i8>
1316 ret <16 x i8> %2
1317 }
1319 define <16 x i8> @trunc_ext_sub_v16i16_v16i8(<16 x i8> %x, <16 x i8> %y) {
1320 ; SSE-LABEL: trunc_ext_sub_v16i16_v16i8:
1322 ; SSE-NEXT: psubb %xmm1, %xmm0
1325 ; AVX-LABEL: trunc_ext_sub_v16i16_v16i8:
1327 ; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0
1329 %a = zext <16 x i8> %x to <16 x i16>
1330 %b = zext <16 x i8> %y to <16 x i16>
1331 %c = sub <16 x i16> %a, %b
1332 %d = trunc <16 x i16> %c to <16 x i8>
1333 ret <16 x i8> %d
1334 }
1340 define <4 x i32> @trunc_sub_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
1341 ; SSE-LABEL: trunc_sub_const_v4i64_v4i32:
1343 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
1344 ; SSE-NEXT: psubd {{.*}}(%rip), %xmm0
1347 ; AVX1-LABEL: trunc_sub_const_v4i64_v4i32:
1349 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1350 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
1351 ; AVX1-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0
1352 ; AVX1-NEXT: vzeroupper
1355 ; AVX2-SLOW-LABEL: trunc_sub_const_v4i64_v4i32:
1356 ; AVX2-SLOW: # %bb.0:
1357 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
1358 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1359 ; AVX2-SLOW-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0
1360 ; AVX2-SLOW-NEXT: vzeroupper
1361 ; AVX2-SLOW-NEXT: retq
1363 ; AVX2-FAST-LABEL: trunc_sub_const_v4i64_v4i32:
1364 ; AVX2-FAST: # %bb.0:
1365 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
1366 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
1367 ; AVX2-FAST-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0
1368 ; AVX2-FAST-NEXT: vzeroupper
1369 ; AVX2-FAST-NEXT: retq
1371 ; AVX512-LABEL: trunc_sub_const_v4i64_v4i32:
1373 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1374 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
1375 ; AVX512-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0
1376 ; AVX512-NEXT: vzeroupper
1378 %1 = sub <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
1379 %2 = trunc <4 x i64> %1 to <4 x i32>
1380 ret <4 x i32> %2
1381 }
1383 define <8 x i16> @trunc_sub_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
1384 ; SSE-LABEL: trunc_sub_const_v8i64_v8i16:
1386 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1387 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
1388 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1389 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
1390 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
1391 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
1392 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
1393 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
1394 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
1395 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1396 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
1397 ; SSE-NEXT: psubw {{.*}}(%rip), %xmm0
1400 ; AVX1-LABEL: trunc_sub_const_v8i64_v8i16:
1402 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
1403 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
1404 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
1405 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
1406 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
1407 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
1408 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
1409 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
1410 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
1411 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
1412 ; AVX1-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
1413 ; AVX1-NEXT: vzeroupper
1416 ; AVX2-SLOW-LABEL: trunc_sub_const_v8i64_v8i16:
1417 ; AVX2-SLOW: # %bb.0:
1418 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
1419 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1420 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
1421 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
1422 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
1423 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
1424 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1425 ; AVX2-SLOW-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
1426 ; AVX2-SLOW-NEXT: vzeroupper
1427 ; AVX2-SLOW-NEXT: retq
1429 ; AVX2-FAST-LABEL: trunc_sub_const_v8i64_v8i16:
1430 ; AVX2-FAST: # %bb.0:
1431 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
1432 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
1433 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
1434 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
1435 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
1436 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1437 ; AVX2-FAST-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
1438 ; AVX2-FAST-NEXT: vzeroupper
1439 ; AVX2-FAST-NEXT: retq
1441 ; AVX512-LABEL: trunc_sub_const_v8i64_v8i16:
1443 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0
1444 ; AVX512-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
1445 ; AVX512-NEXT: vzeroupper
1447 %1 = sub <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
1448 %2 = trunc <8 x i64> %1 to <8 x i16>
1452 define <8 x i16> @trunc_sub_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
1453 ; SSE-LABEL: trunc_sub_const_v8i32_v8i16:
1455 ; SSE-NEXT: pslld $16, %xmm1
1456 ; SSE-NEXT: psrad $16, %xmm1
1457 ; SSE-NEXT: pslld $16, %xmm0
1458 ; SSE-NEXT: psrad $16, %xmm0
1459 ; SSE-NEXT: packssdw %xmm1, %xmm0
1460 ; SSE-NEXT: psubw {{.*}}(%rip), %xmm0
1463 ; AVX1-LABEL: trunc_sub_const_v8i32_v8i16:
1465 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1466 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
1467 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
1468 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
1469 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1470 ; AVX1-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
1471 ; AVX1-NEXT: vzeroupper
1474 ; AVX2-LABEL: trunc_sub_const_v8i32_v8i16:
1476 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
1477 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1478 ; AVX2-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
1479 ; AVX2-NEXT: vzeroupper
1482 ; AVX512-LABEL: trunc_sub_const_v8i32_v8i16:
1484 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1485 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
1486 ; AVX512-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
1487 ; AVX512-NEXT: vzeroupper
1489 %1 = sub <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1490 %2 = trunc <8 x i32> %1 to <8 x i16>
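; For the wide constant case below, the subtract is expected to be narrowed:
; every target truncates the <16 x i64> input first and then issues a single
; psubb/vpsubb against the truncated constant-pool value.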
1494 define <16 x i8> @trunc_sub_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
1495 ; SSE-LABEL: trunc_sub_const_v16i64_v16i8:
1497 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
1498 ; SSE-NEXT: pand %xmm8, %xmm7
1499 ; SSE-NEXT: pand %xmm8, %xmm6
1500 ; SSE-NEXT: packuswb %xmm7, %xmm6
1501 ; SSE-NEXT: pand %xmm8, %xmm5
1502 ; SSE-NEXT: pand %xmm8, %xmm4
1503 ; SSE-NEXT: packuswb %xmm5, %xmm4
1504 ; SSE-NEXT: packuswb %xmm6, %xmm4
1505 ; SSE-NEXT: pand %xmm8, %xmm3
1506 ; SSE-NEXT: pand %xmm8, %xmm2
1507 ; SSE-NEXT: packuswb %xmm3, %xmm2
1508 ; SSE-NEXT: pand %xmm8, %xmm1
1509 ; SSE-NEXT: pand %xmm8, %xmm0
1510 ; SSE-NEXT: packuswb %xmm1, %xmm0
1511 ; SSE-NEXT: packuswb %xmm2, %xmm0
1512 ; SSE-NEXT: packuswb %xmm4, %xmm0
1513 ; SSE-NEXT: psubb {{.*}}(%rip), %xmm0
1516 ; AVX1-LABEL: trunc_sub_const_v16i64_v16i8:
1518 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
1519 ; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = [1.2598673968951787E-321,1.2598673968951787E-321]
1520 ; AVX1-NEXT: # xmm5 = mem[0,0]
1521 ; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4
1522 ; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
1523 ; AVX1-NEXT: vpackusdw %xmm4, %xmm3, %xmm3
1524 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
1525 ; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4
1526 ; AVX1-NEXT: vandps %xmm5, %xmm2, %xmm2
1527 ; AVX1-NEXT: vpackusdw %xmm4, %xmm2, %xmm2
1528 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
1529 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
1530 ; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
1531 ; AVX1-NEXT: vandps %xmm5, %xmm1, %xmm1
1532 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
1533 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
1534 ; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
1535 ; AVX1-NEXT: vandps %xmm5, %xmm0, %xmm0
1536 ; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
1537 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
1538 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
1539 ; AVX1-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
1540 ; AVX1-NEXT: vzeroupper
1543 ; AVX2-SLOW-LABEL: trunc_sub_const_v16i64_v16i8:
1544 ; AVX2-SLOW: # %bb.0:
1545 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
1546 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
1547 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
1548 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
1549 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
1550 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
1551 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
1552 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
1553 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
1554 ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
1555 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
1556 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1557 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
1558 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
1559 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
1560 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
1561 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1562 ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
1563 ; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
1564 ; AVX2-SLOW-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
1565 ; AVX2-SLOW-NEXT: vzeroupper
1566 ; AVX2-SLOW-NEXT: retq
1568 ; AVX2-FAST-LABEL: trunc_sub_const_v16i64_v16i8:
1569 ; AVX2-FAST: # %bb.0:
1570 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
1571 ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2
1572 ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3
1573 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
1574 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
1575 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2
1576 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
1577 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
1578 ; AVX2-FAST-NEXT: vpand %xmm5, %xmm2, %xmm2
1579 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0
1580 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1
1581 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
1582 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0
1583 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1584 ; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0
1585 ; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
1586 ; AVX2-FAST-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
1587 ; AVX2-FAST-NEXT: vzeroupper
1588 ; AVX2-FAST-NEXT: retq
1590 ; AVX512-LABEL: trunc_sub_const_v16i64_v16i8:
1592 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
1593 ; AVX512-NEXT: vpmovqd %zmm1, %ymm1
1594 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
1595 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
1596 ; AVX512-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
1597 ; AVX512-NEXT: vzeroupper
1599 %1 = sub <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
1600 %2 = trunc <16 x i64> %1 to <16 x i8>
1604 define <16 x i8> @trunc_sub_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
1605 ; SSE-LABEL: trunc_sub_const_v16i32_v16i8:
1607 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
1608 ; SSE-NEXT: pand %xmm4, %xmm3
1609 ; SSE-NEXT: pand %xmm4, %xmm2
1610 ; SSE-NEXT: packuswb %xmm3, %xmm2
1611 ; SSE-NEXT: pand %xmm4, %xmm1
1612 ; SSE-NEXT: pand %xmm4, %xmm0
1613 ; SSE-NEXT: packuswb %xmm1, %xmm0
1614 ; SSE-NEXT: packuswb %xmm2, %xmm0
1615 ; SSE-NEXT: psubb {{.*}}(%rip), %xmm0
1618 ; AVX1-LABEL: trunc_sub_const_v16i32_v16i8:
1620 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
1621 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
1622 ; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2
1623 ; AVX1-NEXT: vandps %xmm3, %xmm1, %xmm1
1624 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
1625 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
1626 ; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2
1627 ; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm0
1628 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
1629 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
1630 ; AVX1-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
1631 ; AVX1-NEXT: vzeroupper
1634 ; AVX2-LABEL: trunc_sub_const_v16i32_v16i8:
1636 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
1637 ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
1638 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
1639 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
1640 ; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1
1641 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
1642 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1643 ; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0
1644 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
1645 ; AVX2-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
1646 ; AVX2-NEXT: vzeroupper
1649 ; AVX512-LABEL: trunc_sub_const_v16i32_v16i8:
1651 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
1652 ; AVX512-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
1653 ; AVX512-NEXT: vzeroupper
1655 %1 = sub <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1656 %2 = trunc <16 x i32> %1 to <16 x i8>
1660 define <16 x i8> @trunc_sub_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
1661 ; SSE-LABEL: trunc_sub_const_v16i16_v16i8:
1663 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
1664 ; SSE-NEXT: pand %xmm2, %xmm1
1665 ; SSE-NEXT: pand %xmm2, %xmm0
1666 ; SSE-NEXT: packuswb %xmm1, %xmm0
1667 ; SSE-NEXT: psubb {{.*}}(%rip), %xmm0
1670 ; AVX1-LABEL: trunc_sub_const_v16i16_v16i8:
1672 ; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
1673 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1674 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
1675 ; AVX1-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
1676 ; AVX1-NEXT: vzeroupper
1679 ; AVX2-LABEL: trunc_sub_const_v16i16_v16i8:
1681 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
1682 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1683 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
1684 ; AVX2-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
1685 ; AVX2-NEXT: vzeroupper
1688 ; AVX512F-LABEL: trunc_sub_const_v16i16_v16i8:
1690 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1691 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
1692 ; AVX512F-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
1693 ; AVX512F-NEXT: vzeroupper
1694 ; AVX512F-NEXT: retq
1696 ; AVX512BW-LABEL: trunc_sub_const_v16i16_v16i8:
1697 ; AVX512BW: # %bb.0:
1698 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1699 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
1700 ; AVX512BW-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
1701 ; AVX512BW-NEXT: vzeroupper
1702 ; AVX512BW-NEXT: retq
1704 ; AVX512DQ-LABEL: trunc_sub_const_v16i16_v16i8:
1705 ; AVX512DQ: # %bb.0:
1706 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1707 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
1708 ; AVX512DQ-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
1709 ; AVX512DQ-NEXT: vzeroupper
1710 ; AVX512DQ-NEXT: retq
1711 %1 = sub <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
1712 %2 = trunc <16 x i16> %1 to <16 x i8>
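; Subtracting a constant from (or subtracting from a constant) a zero-extended
; value should likewise narrow to one byte subtract of the truncated constant.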
1716 define <16 x i8> @trunc_ext_sub_const_rhs_v16i16_v16i8(<16 x i8> %x) {
1717 ; SSE-LABEL: trunc_ext_sub_const_rhs_v16i16_v16i8:
1719 ; SSE-NEXT: psubb {{.*}}(%rip), %xmm0
1722 ; AVX-LABEL: trunc_ext_sub_const_rhs_v16i16_v16i8:
1724 ; AVX-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
1726 %a = zext <16 x i8> %x to <16 x i16>
1727 %b = sub <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
1728 %c = trunc <16 x i16> %b to <16 x i8>
1732 define <16 x i8> @trunc_ext_sub_const_lhs_v16i16_v16i8(<16 x i8> %x) {
1733 ; SSE-LABEL: trunc_ext_sub_const_lhs_v16i16_v16i8:
1735 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
1736 ; SSE-NEXT: psubb %xmm0, %xmm1
1737 ; SSE-NEXT: movdqa %xmm1, %xmm0
1740 ; AVX-LABEL: trunc_ext_sub_const_lhs_v16i16_v16i8:
1742 ; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
1743 ; AVX-NEXT: vpsubb %xmm0, %xmm1, %xmm0
1745 %a = zext <16 x i8> %x to <16 x i16>
1746 %b = sub <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, %a
1747 %c = trunc <16 x i16> %b to <16 x i8>
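;
; mul
;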
1755 define <4 x i32> @trunc_mul_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
1756 ; SSE-LABEL: trunc_mul_v4i64_v4i32:
1758 ; SSE-NEXT: pmuludq %xmm3, %xmm1
1759 ; SSE-NEXT: pmuludq %xmm2, %xmm0
1760 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
1763 ; AVX1-LABEL: trunc_mul_v4i64_v4i32:
1765 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
1766 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
1767 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
1768 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
1769 ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
1770 ; AVX1-NEXT: vzeroupper
1773 ; AVX2-SLOW-LABEL: trunc_mul_v4i64_v4i32:
1774 ; AVX2-SLOW: # %bb.0:
1775 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
1776 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
1777 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
1778 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1779 ; AVX2-SLOW-NEXT: vpmulld %xmm1, %xmm0, %xmm0
1780 ; AVX2-SLOW-NEXT: vzeroupper
1781 ; AVX2-SLOW-NEXT: retq
1783 ; AVX2-FAST-LABEL: trunc_mul_v4i64_v4i32:
1784 ; AVX2-FAST: # %bb.0:
1785 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
1786 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
1787 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
1788 ; AVX2-FAST-NEXT: vpmulld %xmm1, %xmm0, %xmm0
1789 ; AVX2-FAST-NEXT: vzeroupper
1790 ; AVX2-FAST-NEXT: retq
1792 ; AVX512F-LABEL: trunc_mul_v4i64_v4i32:
1794 ; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
1795 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1796 ; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
1797 ; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
1798 ; AVX512F-NEXT: vpmulld %xmm1, %xmm0, %xmm0
1799 ; AVX512F-NEXT: vzeroupper
1800 ; AVX512F-NEXT: retq
1802 ; AVX512BW-LABEL: trunc_mul_v4i64_v4i32:
1803 ; AVX512BW: # %bb.0:
1804 ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
1805 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1806 ; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
1807 ; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
1808 ; AVX512BW-NEXT: vpmulld %xmm1, %xmm0, %xmm0
1809 ; AVX512BW-NEXT: vzeroupper
1810 ; AVX512BW-NEXT: retq
1812 ; AVX512DQ-LABEL: trunc_mul_v4i64_v4i32:
1813 ; AVX512DQ: # %bb.0:
1814 ; AVX512DQ-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
1815 ; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1816 ; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
1817 ; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0
1818 ; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1819 ; AVX512DQ-NEXT: vzeroupper
1820 ; AVX512DQ-NEXT: retq
1821 %1 = mul <4 x i64> %a0, %a1
1822 %2 = trunc <4 x i64> %1 to <4 x i32>
1826 define <8 x i16> @trunc_mul_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
1827 ; SSE-LABEL: trunc_mul_v8i64_v8i16:
1829 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
1830 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7]
1831 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
1832 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
1833 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
1834 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,2,2,3]
1835 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,0,2,4,5,6,7]
1836 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
1837 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,0,2,4,5,6,7]
1838 ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
1839 ; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm4[0],xmm6[1]
1840 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1841 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
1842 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1843 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
1844 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
1845 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
1846 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
1847 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
1848 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
1849 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1850 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
1851 ; SSE-NEXT: pmullw %xmm6, %xmm0
1854 ; AVX1-LABEL: trunc_mul_v8i64_v8i16:
1856 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
1857 ; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
1858 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3],xmm4[4],xmm5[5,6,7]
1859 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1,2,3],xmm3[4],xmm5[5,6,7]
1860 ; AVX1-NEXT: vpackusdw %xmm4, %xmm3, %xmm3
1861 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
1862 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3],xmm4[4],xmm5[5,6,7]
1863 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm5[1,2,3],xmm2[4],xmm5[5,6,7]
1864 ; AVX1-NEXT: vpackusdw %xmm4, %xmm2, %xmm2
1865 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
1866 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
1867 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1,2,3],xmm3[4],xmm5[5,6,7]
1868 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1,2,3],xmm1[4],xmm5[5,6,7]
1869 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
1870 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
1871 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1,2,3],xmm3[4],xmm5[5,6,7]
1872 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm5[1,2,3],xmm0[4],xmm5[5,6,7]
1873 ; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
1874 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
1875 ; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0
1876 ; AVX1-NEXT: vzeroupper
1879 ; AVX2-SLOW-LABEL: trunc_mul_v8i64_v8i16:
1880 ; AVX2-SLOW: # %bb.0:
1881 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
1882 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
1883 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
1884 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
1885 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
1886 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
1887 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
1888 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
1889 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
1890 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1891 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
1892 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
1893 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
1894 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
1895 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1896 ; AVX2-SLOW-NEXT: vpmullw %xmm2, %xmm0, %xmm0
1897 ; AVX2-SLOW-NEXT: vzeroupper
1898 ; AVX2-SLOW-NEXT: retq
1900 ; AVX2-FAST-LABEL: trunc_mul_v8i64_v8i16:
1901 ; AVX2-FAST: # %bb.0:
1902 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
1903 ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2
1904 ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3
1905 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
1906 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
1907 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2
1908 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
1909 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0
1910 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1
1911 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
1912 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0
1913 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1914 ; AVX2-FAST-NEXT: vpmullw %xmm2, %xmm0, %xmm0
1915 ; AVX2-FAST-NEXT: vzeroupper
1916 ; AVX2-FAST-NEXT: retq
1918 ; AVX512F-LABEL: trunc_mul_v8i64_v8i16:
1920 ; AVX512F-NEXT: vpmovqw %zmm1, %xmm1
1921 ; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
1922 ; AVX512F-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1923 ; AVX512F-NEXT: vzeroupper
1924 ; AVX512F-NEXT: retq
1926 ; AVX512BW-LABEL: trunc_mul_v8i64_v8i16:
1927 ; AVX512BW: # %bb.0:
1928 ; AVX512BW-NEXT: vpmovqw %zmm1, %xmm1
1929 ; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0
1930 ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1931 ; AVX512BW-NEXT: vzeroupper
1932 ; AVX512BW-NEXT: retq
1934 ; AVX512DQ-LABEL: trunc_mul_v8i64_v8i16:
1935 ; AVX512DQ: # %bb.0:
1936 ; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
1937 ; AVX512DQ-NEXT: vpmovqw %zmm0, %xmm0
1938 ; AVX512DQ-NEXT: vzeroupper
1939 ; AVX512DQ-NEXT: retq
1940 %1 = mul <8 x i64> %a0, %a1
1941 %2 = trunc <8 x i64> %1 to <8 x i16>
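; SSE2 has no 32-bit element multiply (pmulld is SSE4.1), so the v8i32 product
; below is built from pmuludq plus shuffles before being packed down to i16.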
1945 define <8 x i16> @trunc_mul_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
1946 ; SSE-LABEL: trunc_mul_v8i32_v8i16:
1948 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
1949 ; SSE-NEXT: pmuludq %xmm2, %xmm0
1950 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1951 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
1952 ; SSE-NEXT: pmuludq %xmm4, %xmm2
1953 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
1954 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
1955 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
1956 ; SSE-NEXT: pmuludq %xmm3, %xmm1
1957 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1958 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
1959 ; SSE-NEXT: pmuludq %xmm2, %xmm3
1960 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
1961 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
1962 ; SSE-NEXT: pslld $16, %xmm1
1963 ; SSE-NEXT: psrad $16, %xmm1
1964 ; SSE-NEXT: pslld $16, %xmm0
1965 ; SSE-NEXT: psrad $16, %xmm0
1966 ; SSE-NEXT: packssdw %xmm1, %xmm0
1969 ; AVX1-LABEL: trunc_mul_v8i32_v8i16:
1971 ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm2
1972 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
1973 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1974 ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
1975 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
1976 ; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
1977 ; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1
1978 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1979 ; AVX1-NEXT: vzeroupper
1982 ; AVX2-LABEL: trunc_mul_v8i32_v8i16:
1984 ; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0
1985 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
1986 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1987 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1988 ; AVX2-NEXT: vzeroupper
1991 ; AVX512-LABEL: trunc_mul_v8i32_v8i16:
1993 ; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm0
1994 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
1995 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1996 ; AVX512-NEXT: vzeroupper
1998 %1 = mul <8 x i32> %a0, %a1
1999 %2 = trunc <8 x i32> %1 to <8 x i16>
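; Full 64-bit multiplies are expensive to emulate: AVX1 expands each lane with
; pmuludq/shift/add sequences and AVX512DQ can use vpmullq directly, while the
; AVX2 and AVX512F/BW paths truncate to 32 bits first and multiply with vpmulld.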
2003 define <16 x i8> @trunc_mul_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
2004 ; SSE-LABEL: trunc_mul_v16i64_v16i8:
2006 ; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm0
2007 ; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm1
2008 ; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm2
2009 ; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm3
2010 ; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm4
2011 ; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm5
2012 ; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm6
2013 ; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm7
2014 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
2015 ; SSE-NEXT: pand %xmm8, %xmm7
2016 ; SSE-NEXT: pand %xmm8, %xmm6
2017 ; SSE-NEXT: packuswb %xmm7, %xmm6
2018 ; SSE-NEXT: pand %xmm8, %xmm5
2019 ; SSE-NEXT: pand %xmm8, %xmm4
2020 ; SSE-NEXT: packuswb %xmm5, %xmm4
2021 ; SSE-NEXT: packuswb %xmm6, %xmm4
2022 ; SSE-NEXT: pand %xmm8, %xmm3
2023 ; SSE-NEXT: pand %xmm8, %xmm2
2024 ; SSE-NEXT: packuswb %xmm3, %xmm2
2025 ; SSE-NEXT: pand %xmm8, %xmm1
2026 ; SSE-NEXT: pand %xmm8, %xmm0
2027 ; SSE-NEXT: packuswb %xmm1, %xmm0
2028 ; SSE-NEXT: packuswb %xmm2, %xmm0
2029 ; SSE-NEXT: packuswb %xmm4, %xmm0
2032 ; AVX1-LABEL: trunc_mul_v16i64_v16i8:
2034 ; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm8
2035 ; AVX1-NEXT: vpmuludq %xmm4, %xmm8, %xmm8
2036 ; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm9
2037 ; AVX1-NEXT: vpmuludq %xmm9, %xmm0, %xmm9
2038 ; AVX1-NEXT: vpaddq %xmm8, %xmm9, %xmm8
2039 ; AVX1-NEXT: vpsllq $32, %xmm8, %xmm8
2040 ; AVX1-NEXT: vpmuludq %xmm4, %xmm0, %xmm9
2041 ; AVX1-NEXT: vpaddq %xmm8, %xmm9, %xmm8
2042 ; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm9
2043 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2044 ; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm4
2045 ; AVX1-NEXT: vpmuludq %xmm9, %xmm4, %xmm10
2046 ; AVX1-NEXT: vpsrlq $32, %xmm9, %xmm4
2047 ; AVX1-NEXT: vpmuludq %xmm4, %xmm0, %xmm4
2048 ; AVX1-NEXT: vpaddq %xmm10, %xmm4, %xmm4
2049 ; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4
2050 ; AVX1-NEXT: vpmuludq %xmm9, %xmm0, %xmm0
2051 ; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm9
2052 ; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4
2053 ; AVX1-NEXT: vpmuludq %xmm5, %xmm4, %xmm4
2054 ; AVX1-NEXT: vpsrlq $32, %xmm5, %xmm0
2055 ; AVX1-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
2056 ; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm0
2057 ; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0
2058 ; AVX1-NEXT: vpmuludq %xmm5, %xmm1, %xmm4
2059 ; AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm10
2060 ; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm0
2061 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
2062 ; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm5
2063 ; AVX1-NEXT: vpmuludq %xmm0, %xmm5, %xmm5
2064 ; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm4
2065 ; AVX1-NEXT: vpmuludq %xmm4, %xmm1, %xmm4
2066 ; AVX1-NEXT: vpaddq %xmm5, %xmm4, %xmm4
2067 ; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4
2068 ; AVX1-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
2069 ; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm1
2070 ; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm0
2071 ; AVX1-NEXT: vpmuludq %xmm6, %xmm0, %xmm0
2072 ; AVX1-NEXT: vpsrlq $32, %xmm6, %xmm4
2073 ; AVX1-NEXT: vpmuludq %xmm4, %xmm2, %xmm4
2074 ; AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm0
2075 ; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0
2076 ; AVX1-NEXT: vpmuludq %xmm6, %xmm2, %xmm4
2077 ; AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm5
2078 ; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm0
2079 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
2080 ; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm4
2081 ; AVX1-NEXT: vpmuludq %xmm0, %xmm4, %xmm4
2082 ; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm6
2083 ; AVX1-NEXT: vpmuludq %xmm6, %xmm2, %xmm6
2084 ; AVX1-NEXT: vpaddq %xmm4, %xmm6, %xmm4
2085 ; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4
2086 ; AVX1-NEXT: vpmuludq %xmm0, %xmm2, %xmm0
2087 ; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm0
2088 ; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm2
2089 ; AVX1-NEXT: vpmuludq %xmm7, %xmm2, %xmm2
2090 ; AVX1-NEXT: vpsrlq $32, %xmm7, %xmm4
2091 ; AVX1-NEXT: vpmuludq %xmm4, %xmm3, %xmm4
2092 ; AVX1-NEXT: vpaddq %xmm2, %xmm4, %xmm2
2093 ; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2
2094 ; AVX1-NEXT: vpmuludq %xmm7, %xmm3, %xmm4
2095 ; AVX1-NEXT: vpaddq %xmm2, %xmm4, %xmm2
2096 ; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm4
2097 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
2098 ; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm6
2099 ; AVX1-NEXT: vpmuludq %xmm4, %xmm6, %xmm6
2100 ; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm7
2101 ; AVX1-NEXT: vpmuludq %xmm7, %xmm3, %xmm7
2102 ; AVX1-NEXT: vpaddq %xmm6, %xmm7, %xmm6
2103 ; AVX1-NEXT: vpsllq $32, %xmm6, %xmm6
2104 ; AVX1-NEXT: vpmuludq %xmm4, %xmm3, %xmm3
2105 ; AVX1-NEXT: vpaddq %xmm6, %xmm3, %xmm3
2106 ; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [1.2598673968951787E-321,1.2598673968951787E-321]
2107 ; AVX1-NEXT: # xmm4 = mem[0,0]
2108 ; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
2109 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
2110 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
2111 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
2112 ; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm3
2113 ; AVX1-NEXT: vpackusdw %xmm0, %xmm3, %xmm0
2114 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
2115 ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
2116 ; AVX1-NEXT: vpand %xmm4, %xmm10, %xmm2
2117 ; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
2118 ; AVX1-NEXT: vpand %xmm4, %xmm9, %xmm2
2119 ; AVX1-NEXT: vpand %xmm4, %xmm8, %xmm3
2120 ; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2
2121 ; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
2122 ; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0
2123 ; AVX1-NEXT: vzeroupper
2126 ; AVX2-SLOW-LABEL: trunc_mul_v16i64_v16i8:
2127 ; AVX2-SLOW: # %bb.0:
2128 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,2,2,3,4,6,6,7]
2129 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3]
2130 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
2131 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
2132 ; AVX2-SLOW-NEXT: vpmulld %xmm7, %xmm3, %xmm3
2133 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,2,2,3,4,6,6,7]
2134 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3]
2135 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
2136 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
2137 ; AVX2-SLOW-NEXT: vpmulld %xmm6, %xmm2, %xmm2
2138 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
2139 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
2140 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
2141 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
2142 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255]
2143 ; AVX2-SLOW-NEXT: vpand %xmm6, %xmm2, %xmm2
2144 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,2,2,3,4,6,6,7]
2145 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3]
2146 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
2147 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
2148 ; AVX2-SLOW-NEXT: vpmulld %xmm5, %xmm1, %xmm1
2149 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7]
2150 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
2151 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
2152 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
2153 ; AVX2-SLOW-NEXT: vpmulld %xmm4, %xmm0, %xmm0
2154 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
2155 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
2156 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
2157 ; AVX2-SLOW-NEXT: vpand %xmm6, %xmm0, %xmm0
2158 ; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
2159 ; AVX2-SLOW-NEXT: vzeroupper
2160 ; AVX2-SLOW-NEXT: retq
2162 ; AVX2-FAST-LABEL: trunc_mul_v16i64_v16i8:
2163 ; AVX2-FAST: # %bb.0:
2164 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,2,4,6,4,6,6,7]
2165 ; AVX2-FAST-NEXT: vpermd %ymm7, %ymm8, %ymm7
2166 ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm8, %ymm3
2167 ; AVX2-FAST-NEXT: vpmulld %xmm7, %xmm3, %xmm3
2168 ; AVX2-FAST-NEXT: vpermd %ymm6, %ymm8, %ymm6
2169 ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm8, %ymm2
2170 ; AVX2-FAST-NEXT: vpmulld %xmm6, %xmm2, %xmm2
2171 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
2172 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
2173 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2
2174 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
2175 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255]
2176 ; AVX2-FAST-NEXT: vpand %xmm6, %xmm2, %xmm2
2177 ; AVX2-FAST-NEXT: vpermd %ymm5, %ymm8, %ymm5
2178 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm8, %ymm1
2179 ; AVX2-FAST-NEXT: vpmulld %xmm5, %xmm1, %xmm1
2180 ; AVX2-FAST-NEXT: vpermd %ymm4, %ymm8, %ymm4
2181 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm8, %ymm0
2182 ; AVX2-FAST-NEXT: vpmulld %xmm4, %xmm0, %xmm0
2183 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
2184 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0
2185 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
2186 ; AVX2-FAST-NEXT: vpand %xmm6, %xmm0, %xmm0
2187 ; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
2188 ; AVX2-FAST-NEXT: vzeroupper
2189 ; AVX2-FAST-NEXT: retq
2191 ; AVX512F-LABEL: trunc_mul_v16i64_v16i8:
2193 ; AVX512F-NEXT: vpmovqd %zmm3, %ymm3
2194 ; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
2195 ; AVX512F-NEXT: vpmulld %ymm3, %ymm1, %ymm1
2196 ; AVX512F-NEXT: vpmovqd %zmm2, %ymm2
2197 ; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
2198 ; AVX512F-NEXT: vpmulld %ymm2, %ymm0, %ymm0
2199 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
2200 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
2201 ; AVX512F-NEXT: vzeroupper
2202 ; AVX512F-NEXT: retq
2204 ; AVX512BW-LABEL: trunc_mul_v16i64_v16i8:
2205 ; AVX512BW: # %bb.0:
2206 ; AVX512BW-NEXT: vpmovqd %zmm3, %ymm3
2207 ; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
2208 ; AVX512BW-NEXT: vpmulld %ymm3, %ymm1, %ymm1
2209 ; AVX512BW-NEXT: vpmovqd %zmm2, %ymm2
2210 ; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
2211 ; AVX512BW-NEXT: vpmulld %ymm2, %ymm0, %ymm0
2212 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
2213 ; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
2214 ; AVX512BW-NEXT: vzeroupper
2215 ; AVX512BW-NEXT: retq
2217 ; AVX512DQ-LABEL: trunc_mul_v16i64_v16i8:
2218 ; AVX512DQ: # %bb.0:
2219 ; AVX512DQ-NEXT: vpmullq %zmm3, %zmm1, %zmm1
2220 ; AVX512DQ-NEXT: vpmullq %zmm2, %zmm0, %zmm0
2221 ; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0
2222 ; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1
2223 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
2224 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
2225 ; AVX512DQ-NEXT: vzeroupper
2226 ; AVX512DQ-NEXT: retq
2227 %1 = mul <16 x i64> %a0, %a1
2228 %2 = trunc <16 x i64> %1 to <16 x i8>
2232 define <16 x i8> @trunc_mul_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
2233 ; SSE-LABEL: trunc_mul_v16i32_v16i8:
2235 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3]
2236 ; SSE-NEXT: pmuludq %xmm4, %xmm0
2237 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2238 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
2239 ; SSE-NEXT: pmuludq %xmm8, %xmm4
2240 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
2241 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
2242 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
2243 ; SSE-NEXT: pmuludq %xmm5, %xmm1
2244 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
2245 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
2246 ; SSE-NEXT: pmuludq %xmm4, %xmm5
2247 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3]
2248 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
2249 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
2250 ; SSE-NEXT: pmuludq %xmm6, %xmm2
2251 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
2252 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
2253 ; SSE-NEXT: pmuludq %xmm4, %xmm5
2254 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3]
2255 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
2256 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
2257 ; SSE-NEXT: pmuludq %xmm7, %xmm3
2258 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
2259 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
2260 ; SSE-NEXT: pmuludq %xmm4, %xmm5
2261 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3]
2262 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
2263 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
2264 ; SSE-NEXT: pand %xmm4, %xmm3
2265 ; SSE-NEXT: pand %xmm4, %xmm2
2266 ; SSE-NEXT: packuswb %xmm3, %xmm2
2267 ; SSE-NEXT: pand %xmm4, %xmm1
2268 ; SSE-NEXT: pand %xmm4, %xmm0
2269 ; SSE-NEXT: packuswb %xmm1, %xmm0
2270 ; SSE-NEXT: packuswb %xmm2, %xmm0
2273 ; AVX1-LABEL: trunc_mul_v16i32_v16i8:
2275 ; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm4
2276 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
2277 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2278 ; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm0
2279 ; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm2
2280 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
2281 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
2282 ; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm1
2283 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
2284 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
2285 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
2286 ; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
2287 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
2288 ; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm2
2289 ; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
2290 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
2291 ; AVX1-NEXT: vzeroupper
2294 ; AVX2-LABEL: trunc_mul_v16i32_v16i8:
2296 ; AVX2-NEXT: vpmulld %ymm2, %ymm0, %ymm0
2297 ; AVX2-NEXT: vpmulld %ymm3, %ymm1, %ymm1
2298 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
2299 ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
2300 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
2301 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
2302 ; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1
2303 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
2304 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
2305 ; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0
2306 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
2307 ; AVX2-NEXT: vzeroupper
2310 ; AVX512-LABEL: trunc_mul_v16i32_v16i8:
2312 ; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0
2313 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
2314 ; AVX512-NEXT: vzeroupper
2316 %1 = mul <16 x i32> %a0, %a1
2317 %2 = trunc <16 x i32> %1 to <16 x i8>
2321 define <16 x i8> @trunc_mul_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
2322 ; SSE-LABEL: trunc_mul_v16i16_v16i8:
2324 ; SSE-NEXT: pmullw %xmm2, %xmm0
2325 ; SSE-NEXT: pmullw %xmm3, %xmm1
2326 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
2327 ; SSE-NEXT: pand %xmm2, %xmm1
2328 ; SSE-NEXT: pand %xmm2, %xmm0
2329 ; SSE-NEXT: packuswb %xmm1, %xmm0
2332 ; AVX1-LABEL: trunc_mul_v16i16_v16i8:
2334 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm2
2335 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
2336 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2337 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
2338 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
2339 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
2340 ; AVX1-NEXT: vpand %xmm1, %xmm2, %xmm1
2341 ; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0
2342 ; AVX1-NEXT: vzeroupper
2345 ; AVX2-LABEL: trunc_mul_v16i16_v16i8:
2347 ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
2348 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
2349 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
2350 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
2351 ; AVX2-NEXT: vzeroupper
2354 ; AVX512F-LABEL: trunc_mul_v16i16_v16i8:
2356 ; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm0
2357 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
2358 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
2359 ; AVX512F-NEXT: vzeroupper
2360 ; AVX512F-NEXT: retq
2362 ; AVX512BW-LABEL: trunc_mul_v16i16_v16i8:
2363 ; AVX512BW: # %bb.0:
2364 ; AVX512BW-NEXT: vpmullw %ymm1, %ymm0, %ymm0
2365 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
2366 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2367 ; AVX512BW-NEXT: vzeroupper
2368 ; AVX512BW-NEXT: retq
2370 ; AVX512DQ-LABEL: trunc_mul_v16i16_v16i8:
2371 ; AVX512DQ: # %bb.0:
2372 ; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0
2373 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
2374 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
2375 ; AVX512DQ-NEXT: vzeroupper
2376 ; AVX512DQ-NEXT: retq
2377 %1 = mul <16 x i16> %a0, %a1
2378 %2 = trunc <16 x i16> %1 to <16 x i8>
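; Only the low 16 bits of each product are needed here, so the multiply should
; be performed as pmullw on the zero-extended bytes rather than as a full
; 32-bit multiply.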
2382 define <8 x i16> @trunc_mul_v8i32_v8i16_zext_8i8(<16 x i8> %a0, <8 x i32> %a1) {
2383 ; SSE-LABEL: trunc_mul_v8i32_v8i16_zext_8i8:
2385 ; SSE-NEXT: pxor %xmm3, %xmm3
2386 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
2387 ; SSE-NEXT: pslld $16, %xmm2
2388 ; SSE-NEXT: psrad $16, %xmm2
2389 ; SSE-NEXT: pslld $16, %xmm1
2390 ; SSE-NEXT: psrad $16, %xmm1
2391 ; SSE-NEXT: packssdw %xmm2, %xmm1
2392 ; SSE-NEXT: pmullw %xmm1, %xmm0
2395 ; AVX1-LABEL: trunc_mul_v8i32_v8i16_zext_8i8:
2397 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
2398 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
2399 ; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
2400 ; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
2401 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
2402 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2403 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
2404 ; AVX1-NEXT: vzeroupper
2407 ; AVX2-LABEL: trunc_mul_v8i32_v8i16_zext_8i8:
2409 ; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2410 ; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
2411 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
2412 ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
2413 ; AVX2-NEXT: vzeroupper
2416 ; AVX512-LABEL: trunc_mul_v8i32_v8i16_zext_8i8:
2418 ; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
2419 ; AVX512-NEXT: vpmovdw %zmm1, %ymm1
2420 ; AVX512-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2421 ; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0
2422 ; AVX512-NEXT: vzeroupper
2424 %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2425 %2 = zext <8 x i8> %1 to <8 x i32>
2426 %3 = mul <8 x i32> %2, %a1
2427 %4 = trunc <8 x i32> %3 to <8 x i16>
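;
; mul (const)
;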
2435 define <4 x i32> @trunc_mul_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
2436 ; SSE-LABEL: trunc_mul_const_v4i64_v4i32:
2438 ; SSE-NEXT: movl $1, %eax
2439 ; SSE-NEXT: movq %rax, %xmm2
2440 ; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
2441 ; SSE-NEXT: pmuludq %xmm2, %xmm0
2442 ; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm1
2443 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2446 ; AVX1-LABEL: trunc_mul_const_v4i64_v4i32:
2448 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
2449 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2450 ; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
2451 ; AVX1-NEXT: vzeroupper
2454 ; AVX2-SLOW-LABEL: trunc_mul_const_v4i64_v4i32:
2455 ; AVX2-SLOW: # %bb.0:
2456 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
2457 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
2458 ; AVX2-SLOW-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
2459 ; AVX2-SLOW-NEXT: vzeroupper
2460 ; AVX2-SLOW-NEXT: retq
2462 ; AVX2-FAST-LABEL: trunc_mul_const_v4i64_v4i32:
2463 ; AVX2-FAST: # %bb.0:
2464 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
2465 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
2466 ; AVX2-FAST-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
2467 ; AVX2-FAST-NEXT: vzeroupper
2468 ; AVX2-FAST-NEXT: retq
2470 ; AVX512-LABEL: trunc_mul_const_v4i64_v4i32:
2472 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
2473 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
2474 ; AVX512-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
2475 ; AVX512-NEXT: vzeroupper
2477 %1 = mul <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
2478 %2 = trunc <4 x i64> %1 to <4 x i32>
2482 define <8 x i16> @trunc_mul_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
2483 ; SSE-LABEL: trunc_mul_const_v8i64_v8i16:
2485 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
2486 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
2487 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2488 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
2489 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
2490 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
2491 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
2492 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
2493 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
2494 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2495 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
2496 ; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
2499 ; AVX1-LABEL: trunc_mul_const_v8i64_v8i16:
2501 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
2502 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
2503 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
2504 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
2505 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
2506 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
2507 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
2508 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
2509 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
2510 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
2511 ; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
2512 ; AVX1-NEXT: vzeroupper
2515 ; AVX2-SLOW-LABEL: trunc_mul_const_v8i64_v8i16:
2516 ; AVX2-SLOW: # %bb.0:
2517 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
2518 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
2519 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
2520 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
2521 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
2522 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
2523 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
2524 ; AVX2-SLOW-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
2525 ; AVX2-SLOW-NEXT: vzeroupper
2526 ; AVX2-SLOW-NEXT: retq
2528 ; AVX2-FAST-LABEL: trunc_mul_const_v8i64_v8i16:
2529 ; AVX2-FAST: # %bb.0:
2530 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
2531 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
2532 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
2533 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
2534 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
2535 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
2536 ; AVX2-FAST-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
2537 ; AVX2-FAST-NEXT: vzeroupper
2538 ; AVX2-FAST-NEXT: retq
2540 ; AVX512-LABEL: trunc_mul_const_v8i64_v8i16:
2542 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0
2543 ; AVX512-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
2544 ; AVX512-NEXT: vzeroupper
2546 %1 = mul <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
2547 %2 = trunc <8 x i64> %1 to <8 x i16>
2551 define <8 x i16> @trunc_mul_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
2552 ; SSE-LABEL: trunc_mul_const_v8i32_v8i16:
2554 ; SSE-NEXT: pslld $16, %xmm1
2555 ; SSE-NEXT: psrad $16, %xmm1
2556 ; SSE-NEXT: pslld $16, %xmm0
2557 ; SSE-NEXT: psrad $16, %xmm0
2558 ; SSE-NEXT: packssdw %xmm1, %xmm0
2559 ; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
2562 ; AVX1-LABEL: trunc_mul_const_v8i32_v8i16:
2564 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
2565 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
2566 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
2567 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
2568 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2569 ; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
2570 ; AVX1-NEXT: vzeroupper
2573 ; AVX2-LABEL: trunc_mul_const_v8i32_v8i16:
2575 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
2576 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
2577 ; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
2578 ; AVX2-NEXT: vzeroupper
2581 ; AVX512-LABEL: trunc_mul_const_v8i32_v8i16:
2583 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
2584 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
2585 ; AVX512-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
2586 ; AVX512-NEXT: vzeroupper
2588 %1 = mul <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2589 %2 = trunc <8 x i32> %1 to <8 x i16>
2593 define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
2594 ; SSE-LABEL: trunc_mul_const_v16i64_v16i8:
2596 ; SSE-NEXT: movl $1, %eax
2597 ; SSE-NEXT: movq %rax, %xmm8
2598 ; SSE-NEXT: pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5,6,7]
2599 ; SSE-NEXT: pmuludq %xmm8, %xmm0
2600 ; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm1
2601 ; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm2
2602 ; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm3
2603 ; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm4
2604 ; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm5
2605 ; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm6
2606 ; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm7
2607 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
2608 ; SSE-NEXT: pand %xmm8, %xmm7
2609 ; SSE-NEXT: pand %xmm8, %xmm6
2610 ; SSE-NEXT: packuswb %xmm7, %xmm6
2611 ; SSE-NEXT: pand %xmm8, %xmm5
2612 ; SSE-NEXT: pand %xmm8, %xmm4
2613 ; SSE-NEXT: packuswb %xmm5, %xmm4
2614 ; SSE-NEXT: packuswb %xmm6, %xmm4
2615 ; SSE-NEXT: pand %xmm8, %xmm3
2616 ; SSE-NEXT: pand %xmm8, %xmm2
2617 ; SSE-NEXT: packuswb %xmm3, %xmm2
2618 ; SSE-NEXT: pand %xmm8, %xmm1
2619 ; SSE-NEXT: pand %xmm8, %xmm0
2620 ; SSE-NEXT: packuswb %xmm1, %xmm0
2621 ; SSE-NEXT: packuswb %xmm2, %xmm0
2622 ; SSE-NEXT: packuswb %xmm4, %xmm0
2625 ; AVX1-LABEL: trunc_mul_const_v16i64_v16i8:
2627 ; AVX1-NEXT: movl $1, %eax
2628 ; AVX1-NEXT: vmovq %rax, %xmm4
2629 ; AVX1-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7]
2630 ; AVX1-NEXT: vpmuludq %xmm4, %xmm0, %xmm5
2631 ; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm6
2632 ; AVX1-NEXT: vpmuludq %xmm4, %xmm6, %xmm4
2633 ; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4
2634 ; AVX1-NEXT: vpaddq %xmm4, %xmm5, %xmm8
2635 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2636 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [2,3]
2637 ; AVX1-NEXT: vpmuludq %xmm5, %xmm0, %xmm6
2638 ; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0
2639 ; AVX1-NEXT: vpmuludq %xmm5, %xmm0, %xmm0
2640 ; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0
2641 ; AVX1-NEXT: vpaddq %xmm0, %xmm6, %xmm9
2642 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5]
2643 ; AVX1-NEXT: vpmuludq %xmm5, %xmm1, %xmm6
2644 ; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm7
2645 ; AVX1-NEXT: vpmuludq %xmm5, %xmm7, %xmm5
2646 ; AVX1-NEXT: vpsllq $32, %xmm5, %xmm5
2647 ; AVX1-NEXT: vpaddq %xmm5, %xmm6, %xmm5
2648 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
2649 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [6,7]
2650 ; AVX1-NEXT: vpmuludq %xmm6, %xmm1, %xmm7
2651 ; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1
2652 ; AVX1-NEXT: vpmuludq %xmm6, %xmm1, %xmm1
2653 ; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1
2654 ; AVX1-NEXT: vpaddq %xmm1, %xmm7, %xmm1
2655 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9]
2656 ; AVX1-NEXT: vpmuludq %xmm6, %xmm2, %xmm7
2657 ; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm4
2658 ; AVX1-NEXT: vpmuludq %xmm6, %xmm4, %xmm4
2659 ; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4
2660 ; AVX1-NEXT: vpaddq %xmm4, %xmm7, %xmm4
2661 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
2662 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [10,11]
2663 ; AVX1-NEXT: vpmuludq %xmm6, %xmm2, %xmm7
2664 ; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm2
2665 ; AVX1-NEXT: vpmuludq %xmm6, %xmm2, %xmm2
2666 ; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2
2667 ; AVX1-NEXT: vpaddq %xmm2, %xmm7, %xmm2
2668 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [12,13]
2669 ; AVX1-NEXT: vpmuludq %xmm6, %xmm3, %xmm7
2670 ; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm0
2671 ; AVX1-NEXT: vpmuludq %xmm6, %xmm0, %xmm0
2672 ; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0
2673 ; AVX1-NEXT: vpaddq %xmm0, %xmm7, %xmm0
2674 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
2675 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [14,15]
2676 ; AVX1-NEXT: vpmuludq %xmm6, %xmm3, %xmm7
2677 ; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm3
2678 ; AVX1-NEXT: vpmuludq %xmm6, %xmm3, %xmm3
2679 ; AVX1-NEXT: vpsllq $32, %xmm3, %xmm3
2680 ; AVX1-NEXT: vpaddq %xmm3, %xmm7, %xmm3
2681 ; AVX1-NEXT: vmovddup {{.*#+}} xmm6 = [1.2598673968951787E-321,1.2598673968951787E-321]
2682 ; AVX1-NEXT: # xmm6 = mem[0,0]
2683 ; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
2684 ; AVX1-NEXT: vpand %xmm6, %xmm0, %xmm0
2685 ; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
2686 ; AVX1-NEXT: vpand %xmm6, %xmm2, %xmm2
2687 ; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm3
2688 ; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2
2689 ; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
2690 ; AVX1-NEXT: vpand %xmm6, %xmm1, %xmm1
2691 ; AVX1-NEXT: vpand %xmm6, %xmm5, %xmm2
2692 ; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
2693 ; AVX1-NEXT: vpand %xmm6, %xmm9, %xmm2
2694 ; AVX1-NEXT: vpand %xmm6, %xmm8, %xmm3
2695 ; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2
2696 ; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
2697 ; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0
2698 ; AVX1-NEXT: vzeroupper
2701 ; AVX2-SLOW-LABEL: trunc_mul_const_v16i64_v16i8:
2702 ; AVX2-SLOW: # %bb.0:
2703 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
2704 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
2705 ; AVX2-SLOW-NEXT: vpmulld {{.*}}(%rip), %xmm2, %xmm2
2706 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
2707 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
2708 ; AVX2-SLOW-NEXT: vpmulld {{.*}}(%rip), %xmm3, %xmm3
2709 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
2710 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
2711 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
2712 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
2713 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
2714 ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
2715 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
2716 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
2717 ; AVX2-SLOW-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
2718 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
2719 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
2720 ; AVX2-SLOW-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
2721 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
2722 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
2723 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
2724 ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
2725 ; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
2726 ; AVX2-SLOW-NEXT: vzeroupper
2727 ; AVX2-SLOW-NEXT: retq
2729 ; AVX2-FAST-LABEL: trunc_mul_const_v16i64_v16i8:
2730 ; AVX2-FAST: # %bb.0:
2731 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
2732 ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2
2733 ; AVX2-FAST-NEXT: vpmulld {{.*}}(%rip), %xmm2, %xmm2
2734 ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3
2735 ; AVX2-FAST-NEXT: vpmulld {{.*}}(%rip), %xmm3, %xmm3
2736 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
2737 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
2738 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2
2739 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
2740 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
2741 ; AVX2-FAST-NEXT: vpand %xmm5, %xmm2, %xmm2
2742 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0
2743 ; AVX2-FAST-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
2744 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1
2745 ; AVX2-FAST-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
2746 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
2747 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0
2748 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
2749 ; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0
2750 ; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
2751 ; AVX2-FAST-NEXT: vzeroupper
2752 ; AVX2-FAST-NEXT: retq
2754 ; AVX512-LABEL: trunc_mul_const_v16i64_v16i8:
2756 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
2757 ; AVX512-NEXT: vpmulld {{.*}}(%rip), %ymm0, %ymm0
2758 ; AVX512-NEXT: vpmovqd %zmm1, %ymm1
2759 ; AVX512-NEXT: vpmulld {{.*}}(%rip), %ymm1, %ymm1
2760 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
2761 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
2762 ; AVX512-NEXT: vzeroupper
2764 %1 = mul <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
2765 %2 = trunc <16 x i64> %1 to <16 x i8>
  ret <16 x i8> %2
}
2769 define <16 x i8> @trunc_mul_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
2770 ; SSE-LABEL: trunc_mul_const_v16i32_v16i8:
2772 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [0,1,2,3]
2773 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
2774 ; SSE-NEXT: pmuludq %xmm4, %xmm0
2775 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2776 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
2777 ; SSE-NEXT: pmuludq %xmm5, %xmm4
2778 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
2779 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
2780 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [4,5,6,7]
2781 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
2782 ; SSE-NEXT: pmuludq %xmm4, %xmm1
2783 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
2784 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
2785 ; SSE-NEXT: pmuludq %xmm5, %xmm4
2786 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
2787 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
2788 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [8,9,10,11]
2789 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
2790 ; SSE-NEXT: pmuludq %xmm4, %xmm2
2791 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
2792 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
2793 ; SSE-NEXT: pmuludq %xmm5, %xmm4
2794 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
2795 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
2796 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [12,13,14,15]
2797 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
2798 ; SSE-NEXT: pmuludq %xmm4, %xmm3
2799 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
2800 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
2801 ; SSE-NEXT: pmuludq %xmm5, %xmm4
2802 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
2803 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
2804 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
2805 ; SSE-NEXT: pand %xmm4, %xmm3
2806 ; SSE-NEXT: pand %xmm4, %xmm2
2807 ; SSE-NEXT: packuswb %xmm3, %xmm2
2808 ; SSE-NEXT: pand %xmm4, %xmm1
2809 ; SSE-NEXT: pand %xmm4, %xmm0
2810 ; SSE-NEXT: packuswb %xmm1, %xmm0
2811 ; SSE-NEXT: packuswb %xmm2, %xmm0
2814 ; AVX1-LABEL: trunc_mul_const_v16i32_v16i8:
2816 ; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
2817 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2818 ; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
2819 ; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm3
2820 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
2821 ; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
2822 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
2823 ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
2824 ; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
2825 ; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1
2826 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
2827 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
2828 ; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
2829 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
2830 ; AVX1-NEXT: vzeroupper
2833 ; AVX2-LABEL: trunc_mul_const_v16i32_v16i8:
2835 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
2836 ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
2837 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
2838 ; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1
2839 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
2840 ; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1
2841 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
2842 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
2843 ; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
2844 ; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0
2845 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
2846 ; AVX2-NEXT: vzeroupper
2849 ; AVX512-LABEL: trunc_mul_const_v16i32_v16i8:
2851 ; AVX512-NEXT: vpmulld {{.*}}(%rip), %zmm0, %zmm0
2852 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
2853 ; AVX512-NEXT: vzeroupper
2855 %1 = mul <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
2856 %2 = trunc <16 x i32> %1 to <16 x i8>
  ret <16 x i8> %2
}
2860 define <16 x i8> @trunc_mul_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
2861 ; SSE-LABEL: trunc_mul_const_v16i16_v16i8:
2863 ; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
2864 ; SSE-NEXT: pmullw {{.*}}(%rip), %xmm1
2865 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
2866 ; SSE-NEXT: pand %xmm2, %xmm1
2867 ; SSE-NEXT: pand %xmm2, %xmm0
2868 ; SSE-NEXT: packuswb %xmm1, %xmm0
2871 ; AVX1-LABEL: trunc_mul_const_v16i16_v16i8:
2873 ; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm1
2874 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2875 ; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
2876 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
2877 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
2878 ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
2879 ; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0
2880 ; AVX1-NEXT: vzeroupper
2883 ; AVX2-LABEL: trunc_mul_const_v16i16_v16i8:
2885 ; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
2886 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
2887 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
2888 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
2889 ; AVX2-NEXT: vzeroupper
2892 ; AVX512F-LABEL: trunc_mul_const_v16i16_v16i8:
2894 ; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
2895 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
2896 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
2897 ; AVX512F-NEXT: vzeroupper
2898 ; AVX512F-NEXT: retq
2900 ; AVX512BW-LABEL: trunc_mul_const_v16i16_v16i8:
2901 ; AVX512BW: # %bb.0:
2902 ; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
2903 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
2904 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2905 ; AVX512BW-NEXT: vzeroupper
2906 ; AVX512BW-NEXT: retq
2908 ; AVX512DQ-LABEL: trunc_mul_const_v16i16_v16i8:
2909 ; AVX512DQ: # %bb.0:
2910 ; AVX512DQ-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
2911 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
2912 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
2913 ; AVX512DQ-NEXT: vzeroupper
2914 ; AVX512DQ-NEXT: retq
2915 %1 = mul <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
2916 %2 = trunc <16 x i16> %1 to <16 x i8>
  ret <16 x i8> %2
}

;
; and
;
2924 define <4 x i32> @trunc_and_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2925 ; SSE-LABEL: trunc_and_v4i64_v4i32:
2927 ; SSE-NEXT: andps %xmm3, %xmm1
2928 ; SSE-NEXT: andps %xmm2, %xmm0
2929 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2932 ; AVX1-LABEL: trunc_and_v4i64_v4i32:
2934 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
2935 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
2936 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2937 ; AVX1-NEXT: vzeroupper
2940 ; AVX2-SLOW-LABEL: trunc_and_v4i64_v4i32:
2941 ; AVX2-SLOW: # %bb.0:
2942 ; AVX2-SLOW-NEXT: vandps %ymm1, %ymm0, %ymm0
2943 ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
2944 ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
2945 ; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2946 ; AVX2-SLOW-NEXT: vzeroupper
2947 ; AVX2-SLOW-NEXT: retq
2949 ; AVX2-FAST-LABEL: trunc_and_v4i64_v4i32:
2950 ; AVX2-FAST: # %bb.0:
2951 ; AVX2-FAST-NEXT: vandps %ymm1, %ymm0, %ymm0
2952 ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
2953 ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0
2954 ; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2955 ; AVX2-FAST-NEXT: vzeroupper
2956 ; AVX2-FAST-NEXT: retq
2958 ; AVX512-LABEL: trunc_and_v4i64_v4i32:
2960 ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
2961 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
2962 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2963 ; AVX512-NEXT: vzeroupper
2965 %1 = and <4 x i64> %a0, %a1
2966 %2 = trunc <4 x i64> %1 to <4 x i32>
  ret <4 x i32> %2
}
2970 define <8 x i16> @trunc_and_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
2971 ; SSE-LABEL: trunc_and_v8i64_v8i16:
2973 ; SSE-NEXT: pand %xmm6, %xmm2
2974 ; SSE-NEXT: pand %xmm7, %xmm3
2975 ; SSE-NEXT: pand %xmm4, %xmm0
2976 ; SSE-NEXT: pand %xmm5, %xmm1
2977 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
2978 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
2979 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2980 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
2981 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
2982 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
2983 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
2984 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
2985 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
2986 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2987 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
2990 ; AVX1-LABEL: trunc_and_v8i64_v8i16:
2992 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
2993 ; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
2994 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
2995 ; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3
2996 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
2997 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
2998 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
2999 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
3000 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
3001 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
3002 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
3003 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
3004 ; AVX1-NEXT: vzeroupper
3007 ; AVX2-SLOW-LABEL: trunc_and_v8i64_v8i16:
3008 ; AVX2-SLOW: # %bb.0:
3009 ; AVX2-SLOW-NEXT: vpand %ymm3, %ymm1, %ymm1
3010 ; AVX2-SLOW-NEXT: vpand %ymm2, %ymm0, %ymm0
3011 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
3012 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3013 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
3014 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
3015 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
3016 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
3017 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3018 ; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
3019 ; AVX2-SLOW-NEXT: vzeroupper
3020 ; AVX2-SLOW-NEXT: retq
3022 ; AVX2-FAST-LABEL: trunc_and_v8i64_v8i16:
3023 ; AVX2-FAST: # %bb.0:
3024 ; AVX2-FAST-NEXT: vpand %ymm3, %ymm1, %ymm1
3025 ; AVX2-FAST-NEXT: vpand %ymm2, %ymm0, %ymm0
3026 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
3027 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
3028 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
3029 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
3030 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
3031 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3032 ; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
3033 ; AVX2-FAST-NEXT: vzeroupper
3034 ; AVX2-FAST-NEXT: retq
3036 ; AVX512-LABEL: trunc_and_v8i64_v8i16:
3038 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
3039 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0
3040 ; AVX512-NEXT: vzeroupper
3042 %1 = and <8 x i64> %a0, %a1
3043 %2 = trunc <8 x i64> %1 to <8 x i16>
  ret <8 x i16> %2
}
3047 define <8 x i16> @trunc_and_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
3048 ; SSE-LABEL: trunc_and_v8i32_v8i16:
3050 ; SSE-NEXT: pand %xmm2, %xmm0
3051 ; SSE-NEXT: pand %xmm3, %xmm1
3052 ; SSE-NEXT: pslld $16, %xmm1
3053 ; SSE-NEXT: psrad $16, %xmm1
3054 ; SSE-NEXT: pslld $16, %xmm0
3055 ; SSE-NEXT: psrad $16, %xmm0
3056 ; SSE-NEXT: packssdw %xmm1, %xmm0
3059 ; AVX1-LABEL: trunc_and_v8i32_v8i16:
3061 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
3062 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
3063 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
3064 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
3065 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
3066 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3067 ; AVX1-NEXT: vzeroupper
3070 ; AVX2-LABEL: trunc_and_v8i32_v8i16:
3072 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
3073 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
3074 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3075 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
3076 ; AVX2-NEXT: vzeroupper
3079 ; AVX512-LABEL: trunc_and_v8i32_v8i16:
3081 ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
3082 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
3083 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
3084 ; AVX512-NEXT: vzeroupper
3086 %1 = and <8 x i32> %a0, %a1
3087 %2 = trunc <8 x i32> %1 to <8 x i16>
  ret <8 x i16> %2
}
3091 define <16 x i8> @trunc_and_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
3092 ; SSE-LABEL: trunc_and_v16i64_v16i8:
3094 ; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm0
3095 ; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm1
3096 ; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm2
3097 ; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm3
3098 ; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm4
3099 ; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm5
3100 ; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm6
3101 ; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm7
3102 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
3103 ; SSE-NEXT: pand %xmm8, %xmm7
3104 ; SSE-NEXT: pand %xmm8, %xmm6
3105 ; SSE-NEXT: packuswb %xmm7, %xmm6
3106 ; SSE-NEXT: pand %xmm8, %xmm5
3107 ; SSE-NEXT: pand %xmm8, %xmm4
3108 ; SSE-NEXT: packuswb %xmm5, %xmm4
3109 ; SSE-NEXT: packuswb %xmm6, %xmm4
3110 ; SSE-NEXT: pand %xmm8, %xmm3
3111 ; SSE-NEXT: pand %xmm8, %xmm2
3112 ; SSE-NEXT: packuswb %xmm3, %xmm2
3113 ; SSE-NEXT: pand %xmm8, %xmm1
3114 ; SSE-NEXT: pand %xmm8, %xmm0
3115 ; SSE-NEXT: packuswb %xmm1, %xmm0
3116 ; SSE-NEXT: packuswb %xmm2, %xmm0
3117 ; SSE-NEXT: packuswb %xmm4, %xmm0
3120 ; AVX1-LABEL: trunc_and_v16i64_v16i8:
3122 ; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
3123 ; AVX1-NEXT: vandps %ymm5, %ymm1, %ymm1
3124 ; AVX1-NEXT: vandps %ymm6, %ymm2, %ymm2
3125 ; AVX1-NEXT: vandps %ymm7, %ymm3, %ymm3
3126 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
3127 ; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = [1.2598673968951787E-321,1.2598673968951787E-321]
3128 ; AVX1-NEXT: # xmm5 = mem[0,0]
3129 ; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4
3130 ; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
3131 ; AVX1-NEXT: vpackusdw %xmm4, %xmm3, %xmm3
3132 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
3133 ; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4
3134 ; AVX1-NEXT: vandps %xmm5, %xmm2, %xmm2
3135 ; AVX1-NEXT: vpackusdw %xmm4, %xmm2, %xmm2
3136 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
3137 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
3138 ; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
3139 ; AVX1-NEXT: vandps %xmm5, %xmm1, %xmm1
3140 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
3141 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
3142 ; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
3143 ; AVX1-NEXT: vandps %xmm5, %xmm0, %xmm0
3144 ; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
3145 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
3146 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
3147 ; AVX1-NEXT: vzeroupper
3150 ; AVX2-SLOW-LABEL: trunc_and_v16i64_v16i8:
3151 ; AVX2-SLOW: # %bb.0:
3152 ; AVX2-SLOW-NEXT: vpand %ymm5, %ymm1, %ymm1
3153 ; AVX2-SLOW-NEXT: vpand %ymm4, %ymm0, %ymm0
3154 ; AVX2-SLOW-NEXT: vpand %ymm7, %ymm3, %ymm3
3155 ; AVX2-SLOW-NEXT: vpand %ymm6, %ymm2, %ymm2
3156 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
3157 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
3158 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
3159 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
3160 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
3161 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
3162 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
3163 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
3164 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
3165 ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
3166 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
3167 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3168 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
3169 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
3170 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
3171 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
3172 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3173 ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
3174 ; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
3175 ; AVX2-SLOW-NEXT: vzeroupper
3176 ; AVX2-SLOW-NEXT: retq
3178 ; AVX2-FAST-LABEL: trunc_and_v16i64_v16i8:
3179 ; AVX2-FAST: # %bb.0:
3180 ; AVX2-FAST-NEXT: vpand %ymm5, %ymm1, %ymm1
3181 ; AVX2-FAST-NEXT: vpand %ymm4, %ymm0, %ymm0
3182 ; AVX2-FAST-NEXT: vpand %ymm7, %ymm3, %ymm3
3183 ; AVX2-FAST-NEXT: vpand %ymm6, %ymm2, %ymm2
3184 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
3185 ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2
3186 ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3
3187 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
3188 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
3189 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2
3190 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
3191 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
3192 ; AVX2-FAST-NEXT: vpand %xmm5, %xmm2, %xmm2
3193 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0
3194 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1
3195 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
3196 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0
3197 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3198 ; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0
3199 ; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
3200 ; AVX2-FAST-NEXT: vzeroupper
3201 ; AVX2-FAST-NEXT: retq
3203 ; AVX512-LABEL: trunc_and_v16i64_v16i8:
3205 ; AVX512-NEXT: vpandq %zmm3, %zmm1, %zmm1
3206 ; AVX512-NEXT: vpandq %zmm2, %zmm0, %zmm0
3207 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
3208 ; AVX512-NEXT: vpmovqd %zmm1, %ymm1
3209 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
3210 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
3211 ; AVX512-NEXT: vzeroupper
3213 %1 = and <16 x i64> %a0, %a1
3214 %2 = trunc <16 x i64> %1 to <16 x i8>
  ret <16 x i8> %2
}
3218 define <16 x i8> @trunc_and_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
3219 ; SSE-LABEL: trunc_and_v16i32_v16i8:
3221 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
3222 ; SSE-NEXT: pand %xmm8, %xmm7
3223 ; SSE-NEXT: pand %xmm3, %xmm7
3224 ; SSE-NEXT: pand %xmm8, %xmm6
3225 ; SSE-NEXT: pand %xmm2, %xmm6
3226 ; SSE-NEXT: packuswb %xmm7, %xmm6
3227 ; SSE-NEXT: pand %xmm8, %xmm5
3228 ; SSE-NEXT: pand %xmm1, %xmm5
3229 ; SSE-NEXT: pand %xmm8, %xmm4
3230 ; SSE-NEXT: pand %xmm4, %xmm0
3231 ; SSE-NEXT: packuswb %xmm5, %xmm0
3232 ; SSE-NEXT: packuswb %xmm6, %xmm0
3235 ; AVX1-LABEL: trunc_and_v16i32_v16i8:
3237 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
3238 ; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
3239 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
3240 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
3241 ; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2
3242 ; AVX1-NEXT: vandps %xmm3, %xmm1, %xmm1
3243 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
3244 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
3245 ; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2
3246 ; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm0
3247 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
3248 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
3249 ; AVX1-NEXT: vzeroupper
3252 ; AVX2-LABEL: trunc_and_v16i32_v16i8:
3254 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
3255 ; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
3256 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
3257 ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
3258 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
3259 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
3260 ; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1
3261 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
3262 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3263 ; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0
3264 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
3265 ; AVX2-NEXT: vzeroupper
3268 ; AVX512-LABEL: trunc_and_v16i32_v16i8:
3270 ; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0
3271 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
3272 ; AVX512-NEXT: vzeroupper
3274 %1 = and <16 x i32> %a0, %a1
3275 %2 = trunc <16 x i32> %1 to <16 x i8>
  ret <16 x i8> %2
}
3279 define <16 x i8> @trunc_and_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
3280 ; SSE-LABEL: trunc_and_v16i16_v16i8:
3282 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
3283 ; SSE-NEXT: pand %xmm4, %xmm3
3284 ; SSE-NEXT: pand %xmm1, %xmm3
3285 ; SSE-NEXT: pand %xmm4, %xmm2
3286 ; SSE-NEXT: pand %xmm2, %xmm0
3287 ; SSE-NEXT: packuswb %xmm3, %xmm0
3290 ; AVX1-LABEL: trunc_and_v16i16_v16i8:
3292 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
3293 ; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
3294 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
3295 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
3296 ; AVX1-NEXT: vzeroupper
3299 ; AVX2-LABEL: trunc_and_v16i16_v16i8:
3301 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
3302 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
3303 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
3304 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
3305 ; AVX2-NEXT: vzeroupper
3308 ; AVX512F-LABEL: trunc_and_v16i16_v16i8:
3310 ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
3311 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
3312 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
3313 ; AVX512F-NEXT: vzeroupper
3314 ; AVX512F-NEXT: retq
3316 ; AVX512BW-LABEL: trunc_and_v16i16_v16i8:
3317 ; AVX512BW: # %bb.0:
3318 ; AVX512BW-NEXT: vpand %ymm1, %ymm0, %ymm0
3319 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
3320 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
3321 ; AVX512BW-NEXT: vzeroupper
3322 ; AVX512BW-NEXT: retq
3324 ; AVX512DQ-LABEL: trunc_and_v16i16_v16i8:
3325 ; AVX512DQ: # %bb.0:
3326 ; AVX512DQ-NEXT: vpand %ymm1, %ymm0, %ymm0
3327 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
3328 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
3329 ; AVX512DQ-NEXT: vzeroupper
3330 ; AVX512DQ-NEXT: retq
3331 %1 = and <16 x i16> %a0, %a1
3332 %2 = trunc <16 x i16> %1 to <16 x i8>
  ret <16 x i8> %2
}

;
; and (const)
;
3340 define <4 x i32> @trunc_and_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
3341 ; SSE-LABEL: trunc_and_const_v4i64_v4i32:
3343 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3344 ; SSE-NEXT: andps {{.*}}(%rip), %xmm0
3347 ; AVX1-LABEL: trunc_and_const_v4i64_v4i32:
3349 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
3350 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3351 ; AVX1-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
3352 ; AVX1-NEXT: vzeroupper
3355 ; AVX2-SLOW-LABEL: trunc_and_const_v4i64_v4i32:
3356 ; AVX2-SLOW: # %bb.0:
3357 ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
3358 ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
3359 ; AVX2-SLOW-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
3360 ; AVX2-SLOW-NEXT: vzeroupper
3361 ; AVX2-SLOW-NEXT: retq
3363 ; AVX2-FAST-LABEL: trunc_and_const_v4i64_v4i32:
3364 ; AVX2-FAST: # %bb.0:
3365 ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
3366 ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0
3367 ; AVX2-FAST-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
3368 ; AVX2-FAST-NEXT: vzeroupper
3369 ; AVX2-FAST-NEXT: retq
3371 ; AVX512-LABEL: trunc_and_const_v4i64_v4i32:
3373 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
3374 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
3375 ; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
3376 ; AVX512-NEXT: vzeroupper
3378 %1 = and <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
3379 %2 = trunc <4 x i64> %1 to <4 x i32>
  ret <4 x i32> %2
}
3383 define <8 x i16> @trunc_and_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
3384 ; SSE-LABEL: trunc_and_const_v8i64_v8i16:
3386 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
3387 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
3388 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
3389 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
3390 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
3391 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
3392 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
3393 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
3394 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
3395 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3396 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
3397 ; SSE-NEXT: andpd {{.*}}(%rip), %xmm0
3400 ; AVX1-LABEL: trunc_and_const_v8i64_v8i16:
3402 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
3403 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
3404 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
3405 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
3406 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
3407 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
3408 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
3409 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
3410 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
3411 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
3412 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
3413 ; AVX1-NEXT: vzeroupper
3416 ; AVX2-SLOW-LABEL: trunc_and_const_v8i64_v8i16:
3417 ; AVX2-SLOW: # %bb.0:
3418 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
3419 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3420 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
3421 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
3422 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
3423 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
3424 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3425 ; AVX2-SLOW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
3426 ; AVX2-SLOW-NEXT: vzeroupper
3427 ; AVX2-SLOW-NEXT: retq
3429 ; AVX2-FAST-LABEL: trunc_and_const_v8i64_v8i16:
3430 ; AVX2-FAST: # %bb.0:
3431 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
3432 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
3433 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
3434 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
3435 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
3436 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3437 ; AVX2-FAST-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
3438 ; AVX2-FAST-NEXT: vzeroupper
3439 ; AVX2-FAST-NEXT: retq
3441 ; AVX512-LABEL: trunc_and_const_v8i64_v8i16:
3443 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0
3444 ; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
3445 ; AVX512-NEXT: vzeroupper
3447 %1 = and <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
3448 %2 = trunc <8 x i64> %1 to <8 x i16>
  ret <8 x i16> %2
}
3452 define <8 x i16> @trunc_and_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
3453 ; SSE-LABEL: trunc_and_const_v8i32_v8i16:
3455 ; SSE-NEXT: pslld $16, %xmm1
3456 ; SSE-NEXT: psrad $16, %xmm1
3457 ; SSE-NEXT: pslld $16, %xmm0
3458 ; SSE-NEXT: psrad $16, %xmm0
3459 ; SSE-NEXT: packssdw %xmm1, %xmm0
3460 ; SSE-NEXT: pand {{.*}}(%rip), %xmm0
3463 ; AVX1-LABEL: trunc_and_const_v8i32_v8i16:
3465 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
3466 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
3467 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
3468 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
3469 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3470 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
3471 ; AVX1-NEXT: vzeroupper
3474 ; AVX2-LABEL: trunc_and_const_v8i32_v8i16:
3476 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
3477 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3478 ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
3479 ; AVX2-NEXT: vzeroupper
3482 ; AVX512-LABEL: trunc_and_const_v8i32_v8i16:
3484 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
3485 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
3486 ; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
3487 ; AVX512-NEXT: vzeroupper
3489 %1 = and <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
3490 %2 = trunc <8 x i32> %1 to <8 x i16>
  ret <8 x i16> %2
}
3494 define <16 x i8> @trunc_and_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
3495 ; SSE-LABEL: trunc_and_const_v16i64_v16i8:
3497 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
3498 ; SSE-NEXT: pand %xmm8, %xmm7
3499 ; SSE-NEXT: pand %xmm8, %xmm6
3500 ; SSE-NEXT: packuswb %xmm7, %xmm6
3501 ; SSE-NEXT: pand %xmm8, %xmm5
3502 ; SSE-NEXT: pand %xmm8, %xmm4
3503 ; SSE-NEXT: packuswb %xmm5, %xmm4
3504 ; SSE-NEXT: packuswb %xmm6, %xmm4
3505 ; SSE-NEXT: pand %xmm8, %xmm3
3506 ; SSE-NEXT: pand %xmm8, %xmm2
3507 ; SSE-NEXT: packuswb %xmm3, %xmm2
3508 ; SSE-NEXT: pand %xmm8, %xmm1
3509 ; SSE-NEXT: pand %xmm8, %xmm0
3510 ; SSE-NEXT: packuswb %xmm1, %xmm0
3511 ; SSE-NEXT: packuswb %xmm2, %xmm0
3512 ; SSE-NEXT: packuswb %xmm4, %xmm0
3513 ; SSE-NEXT: pand {{.*}}(%rip), %xmm0
3516 ; AVX1-LABEL: trunc_and_const_v16i64_v16i8:
3518 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
3519 ; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = [1.2598673968951787E-321,1.2598673968951787E-321]
3520 ; AVX1-NEXT: # xmm5 = mem[0,0]
3521 ; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4
3522 ; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
3523 ; AVX1-NEXT: vpackusdw %xmm4, %xmm3, %xmm3
3524 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
3525 ; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4
3526 ; AVX1-NEXT: vandps %xmm5, %xmm2, %xmm2
3527 ; AVX1-NEXT: vpackusdw %xmm4, %xmm2, %xmm2
3528 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
3529 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
3530 ; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
3531 ; AVX1-NEXT: vandps %xmm5, %xmm1, %xmm1
3532 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
3533 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
3534 ; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
3535 ; AVX1-NEXT: vandps %xmm5, %xmm0, %xmm0
3536 ; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
3537 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
3538 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
3539 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
3540 ; AVX1-NEXT: vzeroupper
3543 ; AVX2-SLOW-LABEL: trunc_and_const_v16i64_v16i8:
3544 ; AVX2-SLOW: # %bb.0:
3545 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
3546 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
3547 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
3548 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
3549 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
3550 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
3551 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
3552 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
3553 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
3554 ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
3555 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
3556 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3557 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
3558 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
3559 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
3560 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
3561 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3562 ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
3563 ; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
3564 ; AVX2-SLOW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
3565 ; AVX2-SLOW-NEXT: vzeroupper
3566 ; AVX2-SLOW-NEXT: retq
3568 ; AVX2-FAST-LABEL: trunc_and_const_v16i64_v16i8:
3569 ; AVX2-FAST: # %bb.0:
3570 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
3571 ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2
3572 ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3
3573 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
3574 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
3575 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2
3576 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
3577 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
3578 ; AVX2-FAST-NEXT: vpand %xmm5, %xmm2, %xmm2
3579 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0
3580 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1
3581 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
3582 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0
3583 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3584 ; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0
3585 ; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
3586 ; AVX2-FAST-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
3587 ; AVX2-FAST-NEXT: vzeroupper
3588 ; AVX2-FAST-NEXT: retq
3590 ; AVX512-LABEL: trunc_and_const_v16i64_v16i8:
3592 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
3593 ; AVX512-NEXT: vpmovqd %zmm1, %ymm1
3594 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
3595 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
3596 ; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
3597 ; AVX512-NEXT: vzeroupper
3599 %1 = and <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
3600 %2 = trunc <16 x i64> %1 to <16 x i8>
  ret <16 x i8> %2
}
3604 define <16 x i8> @trunc_and_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
3605 ; SSE-LABEL: trunc_and_const_v16i32_v16i8:
3607 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
3608 ; SSE-NEXT: pand %xmm4, %xmm3
3609 ; SSE-NEXT: pand %xmm4, %xmm2
3610 ; SSE-NEXT: packuswb %xmm3, %xmm2
3611 ; SSE-NEXT: pand %xmm4, %xmm1
3612 ; SSE-NEXT: pand %xmm4, %xmm0
3613 ; SSE-NEXT: packuswb %xmm1, %xmm0
3614 ; SSE-NEXT: packuswb %xmm2, %xmm0
3615 ; SSE-NEXT: pand {{.*}}(%rip), %xmm0
3618 ; AVX1-LABEL: trunc_and_const_v16i32_v16i8:
3620 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
3621 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
3622 ; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2
3623 ; AVX1-NEXT: vandps %xmm3, %xmm1, %xmm1
3624 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
3625 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
3626 ; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2
3627 ; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm0
3628 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
3629 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
3630 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
3631 ; AVX1-NEXT: vzeroupper
3634 ; AVX2-LABEL: trunc_and_const_v16i32_v16i8:
3636 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
3637 ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
3638 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
3639 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
3640 ; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1
3641 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
3642 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3643 ; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0
3644 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
3645 ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
3646 ; AVX2-NEXT: vzeroupper
3649 ; AVX512-LABEL: trunc_and_const_v16i32_v16i8:
3651 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
3652 ; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
3653 ; AVX512-NEXT: vzeroupper
3655 %1 = and <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
3656 %2 = trunc <16 x i32> %1 to <16 x i8>
  ret <16 x i8> %2
}
3660 define <16 x i8> @trunc_and_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
3661 ; SSE-LABEL: trunc_and_const_v16i16_v16i8:
3663 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
3664 ; SSE-NEXT: pand %xmm2, %xmm1
3665 ; SSE-NEXT: pand %xmm2, %xmm0
3666 ; SSE-NEXT: packuswb %xmm1, %xmm0
3667 ; SSE-NEXT: pand {{.*}}(%rip), %xmm0
3670 ; AVX1-LABEL: trunc_and_const_v16i16_v16i8:
3672 ; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
3673 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
3674 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
3675 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
3676 ; AVX1-NEXT: vzeroupper
3679 ; AVX2-LABEL: trunc_and_const_v16i16_v16i8:
3681 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
3682 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
3683 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
3684 ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
3685 ; AVX2-NEXT: vzeroupper
3688 ; AVX512F-LABEL: trunc_and_const_v16i16_v16i8:
3690 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
3691 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
3692 ; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
3693 ; AVX512F-NEXT: vzeroupper
3694 ; AVX512F-NEXT: retq
3696 ; AVX512BW-LABEL: trunc_and_const_v16i16_v16i8:
3697 ; AVX512BW: # %bb.0:
3698 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
3699 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
3700 ; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
3701 ; AVX512BW-NEXT: vzeroupper
3702 ; AVX512BW-NEXT: retq
3704 ; AVX512DQ-LABEL: trunc_and_const_v16i16_v16i8:
3705 ; AVX512DQ: # %bb.0:
3706 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
3707 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
3708 ; AVX512DQ-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
3709 ; AVX512DQ-NEXT: vzeroupper
3710 ; AVX512DQ-NEXT: retq
3711 %1 = and <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
3712 %2 = trunc <16 x i16> %1 to <16 x i8>
  ret <16 x i8> %2
}

;
; xor
;
3720 define <4 x i32> @trunc_xor_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
3721 ; SSE-LABEL: trunc_xor_v4i64_v4i32:
3723 ; SSE-NEXT: xorps %xmm3, %xmm1
3724 ; SSE-NEXT: xorps %xmm2, %xmm0
3725 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3728 ; AVX1-LABEL: trunc_xor_v4i64_v4i32:
3730 ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
3731 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
3732 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3733 ; AVX1-NEXT: vzeroupper
3736 ; AVX2-SLOW-LABEL: trunc_xor_v4i64_v4i32:
3737 ; AVX2-SLOW: # %bb.0:
3738 ; AVX2-SLOW-NEXT: vxorps %ymm1, %ymm0, %ymm0
3739 ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
3740 ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
3741 ; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
3742 ; AVX2-SLOW-NEXT: vzeroupper
3743 ; AVX2-SLOW-NEXT: retq
3745 ; AVX2-FAST-LABEL: trunc_xor_v4i64_v4i32:
3746 ; AVX2-FAST: # %bb.0:
3747 ; AVX2-FAST-NEXT: vxorps %ymm1, %ymm0, %ymm0
3748 ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
3749 ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0
3750 ; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
3751 ; AVX2-FAST-NEXT: vzeroupper
3752 ; AVX2-FAST-NEXT: retq
3754 ; AVX512-LABEL: trunc_xor_v4i64_v4i32:
3756 ; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0
3757 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
3758 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
3759 ; AVX512-NEXT: vzeroupper
3761 %1 = xor <4 x i64> %a0, %a1
3762 %2 = trunc <4 x i64> %1 to <4 x i32>
  ret <4 x i32> %2
}
3766 define <8 x i16> @trunc_xor_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
3767 ; SSE-LABEL: trunc_xor_v8i64_v8i16:
3769 ; SSE-NEXT: pxor %xmm6, %xmm2
3770 ; SSE-NEXT: pxor %xmm7, %xmm3
3771 ; SSE-NEXT: pxor %xmm4, %xmm0
3772 ; SSE-NEXT: pxor %xmm5, %xmm1
3773 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
3774 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
3775 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
3776 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
3777 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
3778 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
3779 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
3780 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
3781 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
3782 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3783 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
3786 ; AVX1-LABEL: trunc_xor_v8i64_v8i16:
3788 ; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0
3789 ; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm1
3790 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
3791 ; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3
3792 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
3793 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
3794 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
3795 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
3796 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
3797 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
3798 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
3799 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
3800 ; AVX1-NEXT: vzeroupper
3803 ; AVX2-SLOW-LABEL: trunc_xor_v8i64_v8i16:
3804 ; AVX2-SLOW: # %bb.0:
3805 ; AVX2-SLOW-NEXT: vpxor %ymm3, %ymm1, %ymm1
3806 ; AVX2-SLOW-NEXT: vpxor %ymm2, %ymm0, %ymm0
3807 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
3808 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3809 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
3810 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
3811 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
3812 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
3813 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3814 ; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
3815 ; AVX2-SLOW-NEXT: vzeroupper
3816 ; AVX2-SLOW-NEXT: retq
3818 ; AVX2-FAST-LABEL: trunc_xor_v8i64_v8i16:
3819 ; AVX2-FAST: # %bb.0:
3820 ; AVX2-FAST-NEXT: vpxor %ymm3, %ymm1, %ymm1
3821 ; AVX2-FAST-NEXT: vpxor %ymm2, %ymm0, %ymm0
3822 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
3823 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
3824 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
3825 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
3826 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
3827 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3828 ; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
3829 ; AVX2-FAST-NEXT: vzeroupper
3830 ; AVX2-FAST-NEXT: retq
3832 ; AVX512-LABEL: trunc_xor_v8i64_v8i16:
3834 ; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
3835 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0
3836 ; AVX512-NEXT: vzeroupper
3838 %1 = xor <8 x i64> %a0, %a1
3839 %2 = trunc <8 x i64> %1 to <8 x i16>
  ret <8 x i16> %2
}
3843 define <8 x i16> @trunc_xor_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
3844 ; SSE-LABEL: trunc_xor_v8i32_v8i16:
3846 ; SSE-NEXT: pxor %xmm2, %xmm0
3847 ; SSE-NEXT: pxor %xmm3, %xmm1
3848 ; SSE-NEXT: pslld $16, %xmm1
3849 ; SSE-NEXT: psrad $16, %xmm1
3850 ; SSE-NEXT: pslld $16, %xmm0
3851 ; SSE-NEXT: psrad $16, %xmm0
3852 ; SSE-NEXT: packssdw %xmm1, %xmm0
3855 ; AVX1-LABEL: trunc_xor_v8i32_v8i16:
3857 ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
3858 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
3859 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
3860 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
3861 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
3862 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3863 ; AVX1-NEXT: vzeroupper
3866 ; AVX2-LABEL: trunc_xor_v8i32_v8i16:
3868 ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
3869 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
3870 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3871 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
3872 ; AVX2-NEXT: vzeroupper
3875 ; AVX512-LABEL: trunc_xor_v8i32_v8i16:
3877 ; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0
3878 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
3879 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
3880 ; AVX512-NEXT: vzeroupper
3882 %1 = xor <8 x i32> %a0, %a1
3883 %2 = trunc <8 x i32> %1 to <8 x i16>
  ret <8 x i16> %2
}
3887 define <16 x i8> @trunc_xor_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
3888 ; SSE-LABEL: trunc_xor_v16i64_v16i8:
3890 ; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm0
3891 ; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm1
3892 ; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm2
3893 ; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm3
3894 ; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm4
3895 ; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm5
3896 ; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm6
3897 ; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm7
3898 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
3899 ; SSE-NEXT: pand %xmm8, %xmm7
3900 ; SSE-NEXT: pand %xmm8, %xmm6
3901 ; SSE-NEXT: packuswb %xmm7, %xmm6
3902 ; SSE-NEXT: pand %xmm8, %xmm5
3903 ; SSE-NEXT: pand %xmm8, %xmm4
3904 ; SSE-NEXT: packuswb %xmm5, %xmm4
3905 ; SSE-NEXT: packuswb %xmm6, %xmm4
3906 ; SSE-NEXT: pand %xmm8, %xmm3
3907 ; SSE-NEXT: pand %xmm8, %xmm2
3908 ; SSE-NEXT: packuswb %xmm3, %xmm2
3909 ; SSE-NEXT: pand %xmm8, %xmm1
3910 ; SSE-NEXT: pand %xmm8, %xmm0
3911 ; SSE-NEXT: packuswb %xmm1, %xmm0
3912 ; SSE-NEXT: packuswb %xmm2, %xmm0
3913 ; SSE-NEXT: packuswb %xmm4, %xmm0
3916 ; AVX1-LABEL: trunc_xor_v16i64_v16i8:
3918 ; AVX1-NEXT: vxorps %ymm4, %ymm0, %ymm0
3919 ; AVX1-NEXT: vxorps %ymm5, %ymm1, %ymm1
3920 ; AVX1-NEXT: vxorps %ymm6, %ymm2, %ymm2
3921 ; AVX1-NEXT: vxorps %ymm7, %ymm3, %ymm3
3922 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
3923 ; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = [1.2598673968951787E-321,1.2598673968951787E-321]
3924 ; AVX1-NEXT: # xmm5 = mem[0,0]
3925 ; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4
3926 ; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
3927 ; AVX1-NEXT: vpackusdw %xmm4, %xmm3, %xmm3
3928 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
3929 ; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4
3930 ; AVX1-NEXT: vandps %xmm5, %xmm2, %xmm2
3931 ; AVX1-NEXT: vpackusdw %xmm4, %xmm2, %xmm2
3932 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
3933 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
3934 ; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
3935 ; AVX1-NEXT: vandps %xmm5, %xmm1, %xmm1
3936 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
3937 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
3938 ; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
3939 ; AVX1-NEXT: vandps %xmm5, %xmm0, %xmm0
3940 ; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
3941 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
3942 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
3943 ; AVX1-NEXT: vzeroupper
3946 ; AVX2-SLOW-LABEL: trunc_xor_v16i64_v16i8:
3947 ; AVX2-SLOW: # %bb.0:
3948 ; AVX2-SLOW-NEXT: vpxor %ymm5, %ymm1, %ymm1
3949 ; AVX2-SLOW-NEXT: vpxor %ymm4, %ymm0, %ymm0
3950 ; AVX2-SLOW-NEXT: vpxor %ymm7, %ymm3, %ymm3
3951 ; AVX2-SLOW-NEXT: vpxor %ymm6, %ymm2, %ymm2
3952 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
3953 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
3954 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
3955 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
3956 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
3957 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
3958 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
3959 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
3960 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
3961 ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
3962 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
3963 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3964 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
3965 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
3966 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
3967 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
3968 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3969 ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
3970 ; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
3971 ; AVX2-SLOW-NEXT: vzeroupper
3972 ; AVX2-SLOW-NEXT: retq
3974 ; AVX2-FAST-LABEL: trunc_xor_v16i64_v16i8:
3975 ; AVX2-FAST: # %bb.0:
3976 ; AVX2-FAST-NEXT: vpxor %ymm5, %ymm1, %ymm1
3977 ; AVX2-FAST-NEXT: vpxor %ymm4, %ymm0, %ymm0
3978 ; AVX2-FAST-NEXT: vpxor %ymm7, %ymm3, %ymm3
3979 ; AVX2-FAST-NEXT: vpxor %ymm6, %ymm2, %ymm2
3980 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
3981 ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2
3982 ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3
3983 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
3984 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
3985 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2
3986 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
3987 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
3988 ; AVX2-FAST-NEXT: vpand %xmm5, %xmm2, %xmm2
3989 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0
3990 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1
3991 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
3992 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0
3993 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3994 ; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0
3995 ; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
3996 ; AVX2-FAST-NEXT: vzeroupper
3997 ; AVX2-FAST-NEXT: retq
3999 ; AVX512-LABEL: trunc_xor_v16i64_v16i8:
4001 ; AVX512-NEXT: vpxorq %zmm3, %zmm1, %zmm1
4002 ; AVX512-NEXT: vpxorq %zmm2, %zmm0, %zmm0
4003 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
4004 ; AVX512-NEXT: vpmovqd %zmm1, %ymm1
4005 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
4006 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
4007 ; AVX512-NEXT: vzeroupper
4009 %1 = xor <16 x i64> %a0, %a1
4010 %2 = trunc <16 x i64> %1 to <16 x i8>
  ret <16 x i8> %2
}
4014 define <16 x i8> @trunc_xor_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
4015 ; SSE-LABEL: trunc_xor_v16i32_v16i8:
4017 ; SSE-NEXT: pxor %xmm4, %xmm0
4018 ; SSE-NEXT: pxor %xmm5, %xmm1
4019 ; SSE-NEXT: pxor %xmm6, %xmm2
4020 ; SSE-NEXT: pxor %xmm7, %xmm3
4021 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
4022 ; SSE-NEXT: pand %xmm4, %xmm3
4023 ; SSE-NEXT: pand %xmm4, %xmm2
4024 ; SSE-NEXT: packuswb %xmm3, %xmm2
4025 ; SSE-NEXT: pand %xmm4, %xmm1
4026 ; SSE-NEXT: pand %xmm4, %xmm0
4027 ; SSE-NEXT: packuswb %xmm1, %xmm0
4028 ; SSE-NEXT: packuswb %xmm2, %xmm0
4031 ; AVX1-LABEL: trunc_xor_v16i32_v16i8:
4033 ; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0
4034 ; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm1
4035 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
4036 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
4037 ; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2
4038 ; AVX1-NEXT: vandps %xmm3, %xmm1, %xmm1
4039 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
4040 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
4041 ; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2
4042 ; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm0
4043 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
4044 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
4045 ; AVX1-NEXT: vzeroupper
4048 ; AVX2-LABEL: trunc_xor_v16i32_v16i8:
4050 ; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
4051 ; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1
4052 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
4053 ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
4054 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
4055 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
4056 ; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1
4057 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
4058 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4059 ; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0
4060 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
4061 ; AVX2-NEXT: vzeroupper
4064 ; AVX512-LABEL: trunc_xor_v16i32_v16i8:
4066 ; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0
4067 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
4068 ; AVX512-NEXT: vzeroupper
4070 %1 = xor <16 x i32> %a0, %a1
4071 %2 = trunc <16 x i32> %1 to <16 x i8>
  ret <16 x i8> %2
}
4075 define <16 x i8> @trunc_xor_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
4076 ; SSE-LABEL: trunc_xor_v16i16_v16i8:
4078 ; SSE-NEXT: pxor %xmm2, %xmm0
4079 ; SSE-NEXT: pxor %xmm3, %xmm1
4080 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
4081 ; SSE-NEXT: pand %xmm2, %xmm1
4082 ; SSE-NEXT: pand %xmm2, %xmm0
4083 ; SSE-NEXT: packuswb %xmm1, %xmm0
4086 ; AVX1-LABEL: trunc_xor_v16i16_v16i8:
4088 ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
4089 ; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
4090 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
4091 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
4092 ; AVX1-NEXT: vzeroupper
4095 ; AVX2-LABEL: trunc_xor_v16i16_v16i8:
4097 ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
4098 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
4099 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
4100 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
4101 ; AVX2-NEXT: vzeroupper
4104 ; AVX512F-LABEL: trunc_xor_v16i16_v16i8:
4106 ; AVX512F-NEXT: vpxor %ymm1, %ymm0, %ymm0
4107 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
4108 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
4109 ; AVX512F-NEXT: vzeroupper
4110 ; AVX512F-NEXT: retq
4112 ; AVX512BW-LABEL: trunc_xor_v16i16_v16i8:
4113 ; AVX512BW: # %bb.0:
4114 ; AVX512BW-NEXT: vpxor %ymm1, %ymm0, %ymm0
4115 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
4116 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
4117 ; AVX512BW-NEXT: vzeroupper
4118 ; AVX512BW-NEXT: retq
4120 ; AVX512DQ-LABEL: trunc_xor_v16i16_v16i8:
4121 ; AVX512DQ: # %bb.0:
4122 ; AVX512DQ-NEXT: vpxor %ymm1, %ymm0, %ymm0
4123 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
4124 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
4125 ; AVX512DQ-NEXT: vzeroupper
4126 ; AVX512DQ-NEXT: retq
4127 %1 = xor <16 x i16> %a0, %a1
4128 %2 = trunc <16 x i16> %1 to <16 x i8>
  ret <16 x i8> %2
}

;
; xor (const)
;
4136 define <4 x i32> @trunc_xor_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
4137 ; SSE-LABEL: trunc_xor_const_v4i64_v4i32:
4139 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
4140 ; SSE-NEXT: xorps {{.*}}(%rip), %xmm0
4143 ; AVX1-LABEL: trunc_xor_const_v4i64_v4i32:
4145 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
4146 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
4147 ; AVX1-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm0
4148 ; AVX1-NEXT: vzeroupper
4151 ; AVX2-SLOW-LABEL: trunc_xor_const_v4i64_v4i32:
4152 ; AVX2-SLOW: # %bb.0:
4153 ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
4154 ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
4155 ; AVX2-SLOW-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm0
4156 ; AVX2-SLOW-NEXT: vzeroupper
4157 ; AVX2-SLOW-NEXT: retq
4159 ; AVX2-FAST-LABEL: trunc_xor_const_v4i64_v4i32:
4160 ; AVX2-FAST: # %bb.0:
4161 ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
4162 ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0
4163 ; AVX2-FAST-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm0
4164 ; AVX2-FAST-NEXT: vzeroupper
4165 ; AVX2-FAST-NEXT: retq
4167 ; AVX512-LABEL: trunc_xor_const_v4i64_v4i32:
4169 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
4170 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
4171 ; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
4172 ; AVX512-NEXT: vzeroupper
4174 %1 = xor <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
4175 %2 = trunc <4 x i64> %1 to <4 x i32>
4179 define <8 x i16> @trunc_xor_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
4180 ; SSE-LABEL: trunc_xor_const_v8i64_v8i16:
4182 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
4183 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
4184 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
4185 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
4186 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
4187 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
4188 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
4189 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
4190 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
4191 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
4192 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
4193 ; SSE-NEXT: xorpd {{.*}}(%rip), %xmm0
4196 ; AVX1-LABEL: trunc_xor_const_v8i64_v8i16:
4198 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
4199 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
4200 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
4201 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
4202 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
4203 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
4204 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
4205 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
4206 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
4207 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
4208 ; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
4209 ; AVX1-NEXT: vzeroupper
4212 ; AVX2-SLOW-LABEL: trunc_xor_const_v8i64_v8i16:
4213 ; AVX2-SLOW: # %bb.0:
4214 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
4215 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4216 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
4217 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
4218 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
4219 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
4220 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4221 ; AVX2-SLOW-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
4222 ; AVX2-SLOW-NEXT: vzeroupper
4223 ; AVX2-SLOW-NEXT: retq
4225 ; AVX2-FAST-LABEL: trunc_xor_const_v8i64_v8i16:
4226 ; AVX2-FAST: # %bb.0:
4227 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
4228 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
4229 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
4230 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
4231 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
4232 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4233 ; AVX2-FAST-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
4234 ; AVX2-FAST-NEXT: vzeroupper
4235 ; AVX2-FAST-NEXT: retq
4237 ; AVX512-LABEL: trunc_xor_const_v8i64_v8i16:
4239 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0
4240 ; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
4241 ; AVX512-NEXT: vzeroupper
4243 %1 = xor <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
4244 %2 = trunc <8 x i64> %1 to <8 x i16>
4248 define <8 x i16> @trunc_xor_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
4249 ; SSE-LABEL: trunc_xor_const_v8i32_v8i16:
4251 ; SSE-NEXT: pslld $16, %xmm1
4252 ; SSE-NEXT: psrad $16, %xmm1
4253 ; SSE-NEXT: pslld $16, %xmm0
4254 ; SSE-NEXT: psrad $16, %xmm0
4255 ; SSE-NEXT: packssdw %xmm1, %xmm0
4256 ; SSE-NEXT: pxor {{.*}}(%rip), %xmm0
4259 ; AVX1-LABEL: trunc_xor_const_v8i32_v8i16:
4261 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
4262 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
4263 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
4264 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
4265 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4266 ; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
4267 ; AVX1-NEXT: vzeroupper
4270 ; AVX2-LABEL: trunc_xor_const_v8i32_v8i16:
4272 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
4273 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4274 ; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
4275 ; AVX2-NEXT: vzeroupper
4278 ; AVX512-LABEL: trunc_xor_const_v8i32_v8i16:
4280 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
4281 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
4282 ; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
4283 ; AVX512-NEXT: vzeroupper
4285 %1 = xor <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
4286 %2 = trunc <8 x i32> %1 to <8 x i16>
4290 define <16 x i8> @trunc_xor_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
4291 ; SSE-LABEL: trunc_xor_const_v16i64_v16i8:
4293 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
4294 ; SSE-NEXT: pand %xmm8, %xmm7
4295 ; SSE-NEXT: pand %xmm8, %xmm6
4296 ; SSE-NEXT: packuswb %xmm7, %xmm6
4297 ; SSE-NEXT: pand %xmm8, %xmm5
4298 ; SSE-NEXT: pand %xmm8, %xmm4
4299 ; SSE-NEXT: packuswb %xmm5, %xmm4
4300 ; SSE-NEXT: packuswb %xmm6, %xmm4
4301 ; SSE-NEXT: pand %xmm8, %xmm3
4302 ; SSE-NEXT: pand %xmm8, %xmm2
4303 ; SSE-NEXT: packuswb %xmm3, %xmm2
4304 ; SSE-NEXT: pand %xmm8, %xmm1
4305 ; SSE-NEXT: pand %xmm8, %xmm0
4306 ; SSE-NEXT: packuswb %xmm1, %xmm0
4307 ; SSE-NEXT: packuswb %xmm2, %xmm0
4308 ; SSE-NEXT: packuswb %xmm4, %xmm0
4309 ; SSE-NEXT: pxor {{.*}}(%rip), %xmm0
4312 ; AVX1-LABEL: trunc_xor_const_v16i64_v16i8:
4314 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
4315 ; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = [1.2598673968951787E-321,1.2598673968951787E-321]
4316 ; AVX1-NEXT: # xmm5 = mem[0,0]
4317 ; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4
4318 ; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
4319 ; AVX1-NEXT: vpackusdw %xmm4, %xmm3, %xmm3
4320 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
4321 ; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4
4322 ; AVX1-NEXT: vandps %xmm5, %xmm2, %xmm2
4323 ; AVX1-NEXT: vpackusdw %xmm4, %xmm2, %xmm2
4324 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
4325 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
4326 ; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
4327 ; AVX1-NEXT: vandps %xmm5, %xmm1, %xmm1
4328 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
4329 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
4330 ; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
4331 ; AVX1-NEXT: vandps %xmm5, %xmm0, %xmm0
4332 ; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
4333 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
4334 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
4335 ; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
4336 ; AVX1-NEXT: vzeroupper
4339 ; AVX2-SLOW-LABEL: trunc_xor_const_v16i64_v16i8:
4340 ; AVX2-SLOW: # %bb.0:
4341 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
4342 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
4343 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
4344 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
4345 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
4346 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
4347 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
4348 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
4349 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
4350 ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
4351 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
4352 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4353 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
4354 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
4355 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
4356 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
4357 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4358 ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
4359 ; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
4360 ; AVX2-SLOW-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
4361 ; AVX2-SLOW-NEXT: vzeroupper
4362 ; AVX2-SLOW-NEXT: retq
4364 ; AVX2-FAST-LABEL: trunc_xor_const_v16i64_v16i8:
4365 ; AVX2-FAST: # %bb.0:
4366 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
4367 ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2
4368 ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3
4369 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
4370 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
4371 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2
4372 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
4373 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
4374 ; AVX2-FAST-NEXT: vpand %xmm5, %xmm2, %xmm2
4375 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0
4376 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1
4377 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
4378 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0
4379 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4380 ; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0
4381 ; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
4382 ; AVX2-FAST-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
4383 ; AVX2-FAST-NEXT: vzeroupper
4384 ; AVX2-FAST-NEXT: retq
4386 ; AVX512-LABEL: trunc_xor_const_v16i64_v16i8:
4388 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
4389 ; AVX512-NEXT: vpmovqd %zmm1, %ymm1
4390 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
4391 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
4392 ; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
4393 ; AVX512-NEXT: vzeroupper
4395 %1 = xor <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
4396 %2 = trunc <16 x i64> %1 to <16 x i8>
4400 define <16 x i8> @trunc_xor_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
4401 ; SSE-LABEL: trunc_xor_const_v16i32_v16i8:
4403 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
4404 ; SSE-NEXT: pand %xmm4, %xmm3
4405 ; SSE-NEXT: pand %xmm4, %xmm2
4406 ; SSE-NEXT: packuswb %xmm3, %xmm2
4407 ; SSE-NEXT: pand %xmm4, %xmm1
4408 ; SSE-NEXT: pand %xmm4, %xmm0
4409 ; SSE-NEXT: packuswb %xmm1, %xmm0
4410 ; SSE-NEXT: packuswb %xmm2, %xmm0
4411 ; SSE-NEXT: pxor {{.*}}(%rip), %xmm0
4414 ; AVX1-LABEL: trunc_xor_const_v16i32_v16i8:
4416 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
4417 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
4418 ; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2
4419 ; AVX1-NEXT: vandps %xmm3, %xmm1, %xmm1
4420 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
4421 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
4422 ; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2
4423 ; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm0
4424 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
4425 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
4426 ; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
4427 ; AVX1-NEXT: vzeroupper
4430 ; AVX2-LABEL: trunc_xor_const_v16i32_v16i8:
4432 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
4433 ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
4434 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
4435 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
4436 ; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1
4437 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
4438 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4439 ; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0
4440 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
4441 ; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
4442 ; AVX2-NEXT: vzeroupper
4445 ; AVX512-LABEL: trunc_xor_const_v16i32_v16i8:
4447 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
4448 ; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
4449 ; AVX512-NEXT: vzeroupper
4451 %1 = xor <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
4452 %2 = trunc <16 x i32> %1 to <16 x i8>
4456 define <16 x i8> @trunc_xor_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
4457 ; SSE-LABEL: trunc_xor_const_v16i16_v16i8:
4459 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
4460 ; SSE-NEXT: pand %xmm2, %xmm1
4461 ; SSE-NEXT: pand %xmm2, %xmm0
4462 ; SSE-NEXT: packuswb %xmm1, %xmm0
4463 ; SSE-NEXT: pxor {{.*}}(%rip), %xmm0
4466 ; AVX1-LABEL: trunc_xor_const_v16i16_v16i8:
4468 ; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
4469 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
4470 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
4471 ; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
4472 ; AVX1-NEXT: vzeroupper
4475 ; AVX2-LABEL: trunc_xor_const_v16i16_v16i8:
4477 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
4478 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
4479 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
4480 ; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
4481 ; AVX2-NEXT: vzeroupper
4484 ; AVX512F-LABEL: trunc_xor_const_v16i16_v16i8:
4486 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
4487 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
4488 ; AVX512F-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
4489 ; AVX512F-NEXT: vzeroupper
4490 ; AVX512F-NEXT: retq
4492 ; AVX512BW-LABEL: trunc_xor_const_v16i16_v16i8:
4493 ; AVX512BW: # %bb.0:
4494 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
4495 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
4496 ; AVX512BW-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
4497 ; AVX512BW-NEXT: vzeroupper
4498 ; AVX512BW-NEXT: retq
4500 ; AVX512DQ-LABEL: trunc_xor_const_v16i16_v16i8:
4501 ; AVX512DQ: # %bb.0:
4502 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
4503 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
4504 ; AVX512DQ-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
4505 ; AVX512DQ-NEXT: vzeroupper
4506 ; AVX512DQ-NEXT: retq
4507 %1 = xor <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
4508 %2 = trunc <16 x i16> %1 to <16 x i8>
4516 define <4 x i32> @trunc_or_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
4517 ; SSE-LABEL: trunc_or_v4i64_v4i32:
4519 ; SSE-NEXT: orps %xmm3, %xmm1
4520 ; SSE-NEXT: orps %xmm2, %xmm0
4521 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
4524 ; AVX1-LABEL: trunc_or_v4i64_v4i32:
4526 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
4527 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
4528 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
4529 ; AVX1-NEXT: vzeroupper
4532 ; AVX2-SLOW-LABEL: trunc_or_v4i64_v4i32:
4533 ; AVX2-SLOW: # %bb.0:
4534 ; AVX2-SLOW-NEXT: vorps %ymm1, %ymm0, %ymm0
4535 ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
4536 ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
4537 ; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
4538 ; AVX2-SLOW-NEXT: vzeroupper
4539 ; AVX2-SLOW-NEXT: retq
4541 ; AVX2-FAST-LABEL: trunc_or_v4i64_v4i32:
4542 ; AVX2-FAST: # %bb.0:
4543 ; AVX2-FAST-NEXT: vorps %ymm1, %ymm0, %ymm0
4544 ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
4545 ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0
4546 ; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
4547 ; AVX2-FAST-NEXT: vzeroupper
4548 ; AVX2-FAST-NEXT: retq
4550 ; AVX512-LABEL: trunc_or_v4i64_v4i32:
4552 ; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
4553 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
4554 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
4555 ; AVX512-NEXT: vzeroupper
4557 %1 = or <4 x i64> %a0, %a1
4558 %2 = trunc <4 x i64> %1 to <4 x i32>
4562 define <8 x i16> @trunc_or_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
4563 ; SSE-LABEL: trunc_or_v8i64_v8i16:
4565 ; SSE-NEXT: por %xmm6, %xmm2
4566 ; SSE-NEXT: por %xmm7, %xmm3
4567 ; SSE-NEXT: por %xmm4, %xmm0
4568 ; SSE-NEXT: por %xmm5, %xmm1
4569 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
4570 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
4571 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
4572 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
4573 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
4574 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
4575 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
4576 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
4577 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
4578 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
4579 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
4582 ; AVX1-LABEL: trunc_or_v8i64_v8i16:
4584 ; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0
4585 ; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1
4586 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
4587 ; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3
4588 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
4589 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
4590 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
4591 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
4592 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
4593 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
4594 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
4595 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
4596 ; AVX1-NEXT: vzeroupper
4599 ; AVX2-SLOW-LABEL: trunc_or_v8i64_v8i16:
4600 ; AVX2-SLOW: # %bb.0:
4601 ; AVX2-SLOW-NEXT: vpor %ymm3, %ymm1, %ymm1
4602 ; AVX2-SLOW-NEXT: vpor %ymm2, %ymm0, %ymm0
4603 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
4604 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4605 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
4606 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
4607 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
4608 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
4609 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4610 ; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
4611 ; AVX2-SLOW-NEXT: vzeroupper
4612 ; AVX2-SLOW-NEXT: retq
4614 ; AVX2-FAST-LABEL: trunc_or_v8i64_v8i16:
4615 ; AVX2-FAST: # %bb.0:
4616 ; AVX2-FAST-NEXT: vpor %ymm3, %ymm1, %ymm1
4617 ; AVX2-FAST-NEXT: vpor %ymm2, %ymm0, %ymm0
4618 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
4619 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
4620 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
4621 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
4622 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
4623 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4624 ; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
4625 ; AVX2-FAST-NEXT: vzeroupper
4626 ; AVX2-FAST-NEXT: retq
4628 ; AVX512-LABEL: trunc_or_v8i64_v8i16:
4630 ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
4631 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0
4632 ; AVX512-NEXT: vzeroupper
4634 %1 = or <8 x i64> %a0, %a1
4635 %2 = trunc <8 x i64> %1 to <8 x i16>
4639 define <8 x i16> @trunc_or_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
4640 ; SSE-LABEL: trunc_or_v8i32_v8i16:
4642 ; SSE-NEXT: por %xmm2, %xmm0
4643 ; SSE-NEXT: por %xmm3, %xmm1
4644 ; SSE-NEXT: pslld $16, %xmm1
4645 ; SSE-NEXT: psrad $16, %xmm1
4646 ; SSE-NEXT: pslld $16, %xmm0
4647 ; SSE-NEXT: psrad $16, %xmm0
4648 ; SSE-NEXT: packssdw %xmm1, %xmm0
4651 ; AVX1-LABEL: trunc_or_v8i32_v8i16:
4653 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
4654 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
4655 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
4656 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
4657 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
4658 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4659 ; AVX1-NEXT: vzeroupper
4662 ; AVX2-LABEL: trunc_or_v8i32_v8i16:
4664 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
4665 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
4666 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4667 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
4668 ; AVX2-NEXT: vzeroupper
4671 ; AVX512-LABEL: trunc_or_v8i32_v8i16:
4673 ; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
4674 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
4675 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
4676 ; AVX512-NEXT: vzeroupper
4678 %1 = or <8 x i32> %a0, %a1
4679 %2 = trunc <8 x i32> %1 to <8 x i16>
4683 define <16 x i8> @trunc_or_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
4684 ; SSE-LABEL: trunc_or_v16i64_v16i8:
4686 ; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm0
4687 ; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm1
4688 ; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm2
4689 ; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm3
4690 ; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm4
4691 ; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm5
4692 ; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm6
4693 ; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm7
4694 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
4695 ; SSE-NEXT: pand %xmm8, %xmm7
4696 ; SSE-NEXT: pand %xmm8, %xmm6
4697 ; SSE-NEXT: packuswb %xmm7, %xmm6
4698 ; SSE-NEXT: pand %xmm8, %xmm5
4699 ; SSE-NEXT: pand %xmm8, %xmm4
4700 ; SSE-NEXT: packuswb %xmm5, %xmm4
4701 ; SSE-NEXT: packuswb %xmm6, %xmm4
4702 ; SSE-NEXT: pand %xmm8, %xmm3
4703 ; SSE-NEXT: pand %xmm8, %xmm2
4704 ; SSE-NEXT: packuswb %xmm3, %xmm2
4705 ; SSE-NEXT: pand %xmm8, %xmm1
4706 ; SSE-NEXT: pand %xmm8, %xmm0
4707 ; SSE-NEXT: packuswb %xmm1, %xmm0
4708 ; SSE-NEXT: packuswb %xmm2, %xmm0
4709 ; SSE-NEXT: packuswb %xmm4, %xmm0
4712 ; AVX1-LABEL: trunc_or_v16i64_v16i8:
4714 ; AVX1-NEXT: vorps %ymm4, %ymm0, %ymm0
4715 ; AVX1-NEXT: vorps %ymm5, %ymm1, %ymm1
4716 ; AVX1-NEXT: vorps %ymm6, %ymm2, %ymm2
4717 ; AVX1-NEXT: vorps %ymm7, %ymm3, %ymm3
4718 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
4719 ; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = [1.2598673968951787E-321,1.2598673968951787E-321]
4720 ; AVX1-NEXT: # xmm5 = mem[0,0]
4721 ; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4
4722 ; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
4723 ; AVX1-NEXT: vpackusdw %xmm4, %xmm3, %xmm3
4724 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
4725 ; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4
4726 ; AVX1-NEXT: vandps %xmm5, %xmm2, %xmm2
4727 ; AVX1-NEXT: vpackusdw %xmm4, %xmm2, %xmm2
4728 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
4729 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
4730 ; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
4731 ; AVX1-NEXT: vandps %xmm5, %xmm1, %xmm1
4732 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
4733 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
4734 ; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
4735 ; AVX1-NEXT: vandps %xmm5, %xmm0, %xmm0
4736 ; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
4737 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
4738 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
4739 ; AVX1-NEXT: vzeroupper
4742 ; AVX2-SLOW-LABEL: trunc_or_v16i64_v16i8:
4743 ; AVX2-SLOW: # %bb.0:
4744 ; AVX2-SLOW-NEXT: vpor %ymm5, %ymm1, %ymm1
4745 ; AVX2-SLOW-NEXT: vpor %ymm4, %ymm0, %ymm0
4746 ; AVX2-SLOW-NEXT: vpor %ymm7, %ymm3, %ymm3
4747 ; AVX2-SLOW-NEXT: vpor %ymm6, %ymm2, %ymm2
4748 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
4749 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
4750 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
4751 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
4752 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
4753 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
4754 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
4755 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
4756 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
4757 ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
4758 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
4759 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4760 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
4761 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
4762 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
4763 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
4764 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4765 ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
4766 ; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
4767 ; AVX2-SLOW-NEXT: vzeroupper
4768 ; AVX2-SLOW-NEXT: retq
4770 ; AVX2-FAST-LABEL: trunc_or_v16i64_v16i8:
4771 ; AVX2-FAST: # %bb.0:
4772 ; AVX2-FAST-NEXT: vpor %ymm5, %ymm1, %ymm1
4773 ; AVX2-FAST-NEXT: vpor %ymm4, %ymm0, %ymm0
4774 ; AVX2-FAST-NEXT: vpor %ymm7, %ymm3, %ymm3
4775 ; AVX2-FAST-NEXT: vpor %ymm6, %ymm2, %ymm2
4776 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
4777 ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2
4778 ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3
4779 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
4780 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
4781 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2
4782 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
4783 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
4784 ; AVX2-FAST-NEXT: vpand %xmm5, %xmm2, %xmm2
4785 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0
4786 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1
4787 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
4788 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0
4789 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4790 ; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0
4791 ; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
4792 ; AVX2-FAST-NEXT: vzeroupper
4793 ; AVX2-FAST-NEXT: retq
4795 ; AVX512-LABEL: trunc_or_v16i64_v16i8:
4797 ; AVX512-NEXT: vporq %zmm3, %zmm1, %zmm1
4798 ; AVX512-NEXT: vporq %zmm2, %zmm0, %zmm0
4799 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
4800 ; AVX512-NEXT: vpmovqd %zmm1, %ymm1
4801 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
4802 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
4803 ; AVX512-NEXT: vzeroupper
4805 %1 = or <16 x i64> %a0, %a1
4806 %2 = trunc <16 x i64> %1 to <16 x i8>
4810 define <16 x i8> @trunc_or_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
4811 ; SSE-LABEL: trunc_or_v16i32_v16i8:
4813 ; SSE-NEXT: por %xmm4, %xmm0
4814 ; SSE-NEXT: por %xmm5, %xmm1
4815 ; SSE-NEXT: por %xmm6, %xmm2
4816 ; SSE-NEXT: por %xmm7, %xmm3
4817 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
4818 ; SSE-NEXT: pand %xmm4, %xmm3
4819 ; SSE-NEXT: pand %xmm4, %xmm2
4820 ; SSE-NEXT: packuswb %xmm3, %xmm2
4821 ; SSE-NEXT: pand %xmm4, %xmm1
4822 ; SSE-NEXT: pand %xmm4, %xmm0
4823 ; SSE-NEXT: packuswb %xmm1, %xmm0
4824 ; SSE-NEXT: packuswb %xmm2, %xmm0
4827 ; AVX1-LABEL: trunc_or_v16i32_v16i8:
4829 ; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0
4830 ; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1
4831 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
4832 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
4833 ; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2
4834 ; AVX1-NEXT: vandps %xmm3, %xmm1, %xmm1
4835 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
4836 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
4837 ; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2
4838 ; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm0
4839 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
4840 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
4841 ; AVX1-NEXT: vzeroupper
4844 ; AVX2-LABEL: trunc_or_v16i32_v16i8:
4846 ; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0
4847 ; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1
4848 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
4849 ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
4850 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
4851 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
4852 ; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1
4853 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
4854 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4855 ; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0
4856 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
4857 ; AVX2-NEXT: vzeroupper
4860 ; AVX512-LABEL: trunc_or_v16i32_v16i8:
4862 ; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0
4863 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
4864 ; AVX512-NEXT: vzeroupper
4866 %1 = or <16 x i32> %a0, %a1
4867 %2 = trunc <16 x i32> %1 to <16 x i8>
4871 define <16 x i8> @trunc_or_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
4872 ; SSE-LABEL: trunc_or_v16i16_v16i8:
4874 ; SSE-NEXT: por %xmm2, %xmm0
4875 ; SSE-NEXT: por %xmm3, %xmm1
4876 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
4877 ; SSE-NEXT: pand %xmm2, %xmm1
4878 ; SSE-NEXT: pand %xmm2, %xmm0
4879 ; SSE-NEXT: packuswb %xmm1, %xmm0
4882 ; AVX1-LABEL: trunc_or_v16i16_v16i8:
4884 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
4885 ; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
4886 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
4887 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
4888 ; AVX1-NEXT: vzeroupper
4891 ; AVX2-LABEL: trunc_or_v16i16_v16i8:
4893 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
4894 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
4895 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
4896 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
4897 ; AVX2-NEXT: vzeroupper
4900 ; AVX512F-LABEL: trunc_or_v16i16_v16i8:
4902 ; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
4903 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
4904 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
4905 ; AVX512F-NEXT: vzeroupper
4906 ; AVX512F-NEXT: retq
4908 ; AVX512BW-LABEL: trunc_or_v16i16_v16i8:
4909 ; AVX512BW: # %bb.0:
4910 ; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
4911 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
4912 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
4913 ; AVX512BW-NEXT: vzeroupper
4914 ; AVX512BW-NEXT: retq
4916 ; AVX512DQ-LABEL: trunc_or_v16i16_v16i8:
4917 ; AVX512DQ: # %bb.0:
4918 ; AVX512DQ-NEXT: vpor %ymm1, %ymm0, %ymm0
4919 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
4920 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
4921 ; AVX512DQ-NEXT: vzeroupper
4922 ; AVX512DQ-NEXT: retq
4923 %1 = or <16 x i16> %a0, %a1
4924 %2 = trunc <16 x i16> %1 to <16 x i8>
4932 define <4 x i32> @trunc_or_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
4933 ; SSE-LABEL: trunc_or_const_v4i64_v4i32:
4935 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
4936 ; SSE-NEXT: orps {{.*}}(%rip), %xmm0
4939 ; AVX1-LABEL: trunc_or_const_v4i64_v4i32:
4941 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
4942 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
4943 ; AVX1-NEXT: vorps {{.*}}(%rip), %xmm0, %xmm0
4944 ; AVX1-NEXT: vzeroupper
4947 ; AVX2-SLOW-LABEL: trunc_or_const_v4i64_v4i32:
4948 ; AVX2-SLOW: # %bb.0:
4949 ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
4950 ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
4951 ; AVX2-SLOW-NEXT: vorps {{.*}}(%rip), %xmm0, %xmm0
4952 ; AVX2-SLOW-NEXT: vzeroupper
4953 ; AVX2-SLOW-NEXT: retq
4955 ; AVX2-FAST-LABEL: trunc_or_const_v4i64_v4i32:
4956 ; AVX2-FAST: # %bb.0:
4957 ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
4958 ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0
4959 ; AVX2-FAST-NEXT: vorps {{.*}}(%rip), %xmm0, %xmm0
4960 ; AVX2-FAST-NEXT: vzeroupper
4961 ; AVX2-FAST-NEXT: retq
4963 ; AVX512-LABEL: trunc_or_const_v4i64_v4i32:
4965 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
4966 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
4967 ; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
4968 ; AVX512-NEXT: vzeroupper
4970 %1 = or <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
4971 %2 = trunc <4 x i64> %1 to <4 x i32>
4975 define <8 x i16> @trunc_or_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
4976 ; SSE-LABEL: trunc_or_const_v8i64_v8i16:
4978 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
4979 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
4980 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
4981 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
4982 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
4983 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
4984 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
4985 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
4986 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
4987 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
4988 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
4989 ; SSE-NEXT: orpd {{.*}}(%rip), %xmm0
4992 ; AVX1-LABEL: trunc_or_const_v8i64_v8i16:
4994 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
4995 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
4996 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
4997 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
4998 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
4999 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
5000 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
5001 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
5002 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
5003 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
5004 ; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
5005 ; AVX1-NEXT: vzeroupper
5008 ; AVX2-SLOW-LABEL: trunc_or_const_v8i64_v8i16:
5009 ; AVX2-SLOW: # %bb.0:
5010 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
5011 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
5012 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
5013 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
5014 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
5015 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
5016 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
5017 ; AVX2-SLOW-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
5018 ; AVX2-SLOW-NEXT: vzeroupper
5019 ; AVX2-SLOW-NEXT: retq
5021 ; AVX2-FAST-LABEL: trunc_or_const_v8i64_v8i16:
5022 ; AVX2-FAST: # %bb.0:
5023 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
5024 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
5025 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
5026 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
5027 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
5028 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
5029 ; AVX2-FAST-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
5030 ; AVX2-FAST-NEXT: vzeroupper
5031 ; AVX2-FAST-NEXT: retq
5033 ; AVX512-LABEL: trunc_or_const_v8i64_v8i16:
5035 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0
5036 ; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
5037 ; AVX512-NEXT: vzeroupper
5039 %1 = or <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
5040 %2 = trunc <8 x i64> %1 to <8 x i16>
5044 define <8 x i16> @trunc_or_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
5045 ; SSE-LABEL: trunc_or_const_v8i32_v8i16:
5047 ; SSE-NEXT: pslld $16, %xmm1
5048 ; SSE-NEXT: psrad $16, %xmm1
5049 ; SSE-NEXT: pslld $16, %xmm0
5050 ; SSE-NEXT: psrad $16, %xmm0
5051 ; SSE-NEXT: packssdw %xmm1, %xmm0
5052 ; SSE-NEXT: por {{.*}}(%rip), %xmm0
5055 ; AVX1-LABEL: trunc_or_const_v8i32_v8i16:
5057 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
5058 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
5059 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
5060 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
5061 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
5062 ; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
5063 ; AVX1-NEXT: vzeroupper
5066 ; AVX2-LABEL: trunc_or_const_v8i32_v8i16:
5068 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
5069 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
5070 ; AVX2-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
5071 ; AVX2-NEXT: vzeroupper
5074 ; AVX512-LABEL: trunc_or_const_v8i32_v8i16:
5076 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
5077 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
5078 ; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
5079 ; AVX512-NEXT: vzeroupper
5081 %1 = or <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
5082 %2 = trunc <8 x i32> %1 to <8 x i16>
5086 define <16 x i8> @trunc_or_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
5087 ; SSE-LABEL: trunc_or_const_v16i64_v16i8:
5089 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
5090 ; SSE-NEXT: pand %xmm8, %xmm7
5091 ; SSE-NEXT: pand %xmm8, %xmm6
5092 ; SSE-NEXT: packuswb %xmm7, %xmm6
5093 ; SSE-NEXT: pand %xmm8, %xmm5
5094 ; SSE-NEXT: pand %xmm8, %xmm4
5095 ; SSE-NEXT: packuswb %xmm5, %xmm4
5096 ; SSE-NEXT: packuswb %xmm6, %xmm4
5097 ; SSE-NEXT: pand %xmm8, %xmm3
5098 ; SSE-NEXT: pand %xmm8, %xmm2
5099 ; SSE-NEXT: packuswb %xmm3, %xmm2
5100 ; SSE-NEXT: pand %xmm8, %xmm1
5101 ; SSE-NEXT: pand %xmm8, %xmm0
5102 ; SSE-NEXT: packuswb %xmm1, %xmm0
5103 ; SSE-NEXT: packuswb %xmm2, %xmm0
5104 ; SSE-NEXT: packuswb %xmm4, %xmm0
5105 ; SSE-NEXT: por {{.*}}(%rip), %xmm0
5108 ; AVX1-LABEL: trunc_or_const_v16i64_v16i8:
5110 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
5111 ; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = [1.2598673968951787E-321,1.2598673968951787E-321]
5112 ; AVX1-NEXT: # xmm5 = mem[0,0]
5113 ; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4
5114 ; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
5115 ; AVX1-NEXT: vpackusdw %xmm4, %xmm3, %xmm3
5116 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
5117 ; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4
5118 ; AVX1-NEXT: vandps %xmm5, %xmm2, %xmm2
5119 ; AVX1-NEXT: vpackusdw %xmm4, %xmm2, %xmm2
5120 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
5121 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
5122 ; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
5123 ; AVX1-NEXT: vandps %xmm5, %xmm1, %xmm1
5124 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
5125 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
5126 ; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
5127 ; AVX1-NEXT: vandps %xmm5, %xmm0, %xmm0
5128 ; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
5129 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
5130 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
5131 ; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
5132 ; AVX1-NEXT: vzeroupper
5135 ; AVX2-SLOW-LABEL: trunc_or_const_v16i64_v16i8:
5136 ; AVX2-SLOW: # %bb.0:
5137 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
5138 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
5139 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
5140 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
5141 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
5142 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
5143 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
5144 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
5145 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
5146 ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
5147 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
5148 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
5149 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
5150 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
5151 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
5152 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
5153 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
5154 ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
5155 ; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
5156 ; AVX2-SLOW-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
5157 ; AVX2-SLOW-NEXT: vzeroupper
5158 ; AVX2-SLOW-NEXT: retq
5160 ; AVX2-FAST-LABEL: trunc_or_const_v16i64_v16i8:
5161 ; AVX2-FAST: # %bb.0:
5162 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
5163 ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2
5164 ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3
5165 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
5166 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
5167 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2
5168 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
5169 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
5170 ; AVX2-FAST-NEXT: vpand %xmm5, %xmm2, %xmm2
5171 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0
5172 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1
5173 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
5174 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0
5175 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
5176 ; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0
5177 ; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
5178 ; AVX2-FAST-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
5179 ; AVX2-FAST-NEXT: vzeroupper
5180 ; AVX2-FAST-NEXT: retq
5182 ; AVX512-LABEL: trunc_or_const_v16i64_v16i8:
5184 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
5185 ; AVX512-NEXT: vpmovqd %zmm1, %ymm1
5186 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
5187 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
5188 ; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
5189 ; AVX512-NEXT: vzeroupper
5191 %1 = or <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
5192 %2 = trunc <16 x i64> %1 to <16 x i8>
5196 define <16 x i8> @trunc_or_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
5197 ; SSE-LABEL: trunc_or_const_v16i32_v16i8:
5199 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
5200 ; SSE-NEXT: pand %xmm4, %xmm3
5201 ; SSE-NEXT: pand %xmm4, %xmm2
5202 ; SSE-NEXT: packuswb %xmm3, %xmm2
5203 ; SSE-NEXT: pand %xmm4, %xmm1
5204 ; SSE-NEXT: pand %xmm4, %xmm0
5205 ; SSE-NEXT: packuswb %xmm1, %xmm0
5206 ; SSE-NEXT: packuswb %xmm2, %xmm0
5207 ; SSE-NEXT: por {{.*}}(%rip), %xmm0
5210 ; AVX1-LABEL: trunc_or_const_v16i32_v16i8:
5212 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
5213 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
5214 ; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2
5215 ; AVX1-NEXT: vandps %xmm3, %xmm1, %xmm1
5216 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
5217 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
5218 ; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2
5219 ; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm0
5220 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
5221 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
5222 ; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
5223 ; AVX1-NEXT: vzeroupper
5226 ; AVX2-LABEL: trunc_or_const_v16i32_v16i8:
5228 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
5229 ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
5230 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
5231 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
5232 ; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1
5233 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
5234 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
5235 ; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0
5236 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
5237 ; AVX2-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
5238 ; AVX2-NEXT: vzeroupper
5241 ; AVX512-LABEL: trunc_or_const_v16i32_v16i8:
5243 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
5244 ; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
5245 ; AVX512-NEXT: vzeroupper
5247 %1 = or <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
5248 %2 = trunc <16 x i32> %1 to <16 x i8>
5252 define <16 x i8> @trunc_or_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
5253 ; SSE-LABEL: trunc_or_const_v16i16_v16i8:
5255 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
5256 ; SSE-NEXT: pand %xmm2, %xmm1
5257 ; SSE-NEXT: pand %xmm2, %xmm0
5258 ; SSE-NEXT: packuswb %xmm1, %xmm0
5259 ; SSE-NEXT: por {{.*}}(%rip), %xmm0
5262 ; AVX1-LABEL: trunc_or_const_v16i16_v16i8:
5264 ; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
5265 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
5266 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
5267 ; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
5268 ; AVX1-NEXT: vzeroupper
5271 ; AVX2-LABEL: trunc_or_const_v16i16_v16i8:
5273 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
5274 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
5275 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
5276 ; AVX2-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
5277 ; AVX2-NEXT: vzeroupper
5280 ; AVX512F-LABEL: trunc_or_const_v16i16_v16i8:
5282 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
5283 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
5284 ; AVX512F-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
5285 ; AVX512F-NEXT: vzeroupper
5286 ; AVX512F-NEXT: retq
5288 ; AVX512BW-LABEL: trunc_or_const_v16i16_v16i8:
5289 ; AVX512BW: # %bb.0:
5290 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
5291 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
5292 ; AVX512BW-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
5293 ; AVX512BW-NEXT: vzeroupper
5294 ; AVX512BW-NEXT: retq
5296 ; AVX512DQ-LABEL: trunc_or_const_v16i16_v16i8:
5297 ; AVX512DQ: # %bb.0:
5298 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
5299 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
5300 ; AVX512DQ-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
5301 ; AVX512DQ-NEXT: vzeroupper
5302 ; AVX512DQ-NEXT: retq
5303 %1 = or <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
5304 %2 = trunc <16 x i16> %1 to <16 x i8>
5309 ; complex patterns - often created by the vectorizer
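;
; Illustrative only (hypothetical @example_widen_mul, not one of the
; autogenerated tests below, and kept commented out so it does not affect
; the RUN/CHECK lines): the vectorizer typically produces this
; sext -> mul -> trunc shape when a loop multiplies 32-bit values in a
; 64-bit intermediate and stores back a 32-bit result.
;
;   define <4 x i32> @example_widen_mul(<4 x i32> %x, <4 x i32> %y) {
;     %wx = sext <4 x i32> %x to <4 x i64>
;     %wy = sext <4 x i32> %y to <4 x i64>
;     %m  = mul <4 x i64> %wx, %wy
;     %t  = trunc <4 x i64> %m to <4 x i32>
;     ret <4 x i32> %t
;   }
;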
5312 define <4 x i32> @mul_add_const_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
5313 ; SSE-LABEL: mul_add_const_v4i64_v4i32:
5315 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
5316 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3]
5317 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
5318 ; SSE-NEXT: pmuludq %xmm2, %xmm0
5319 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
5320 ; SSE-NEXT: pmuludq %xmm3, %xmm1
5321 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
5322 ; SSE-NEXT: paddd {{.*}}(%rip), %xmm0
5325 ; AVX-LABEL: mul_add_const_v4i64_v4i32:
5327 ; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
5328 ; AVX-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
5330 %1 = sext <4 x i32> %a0 to <4 x i64>
5331 %2 = sext <4 x i32> %a1 to <4 x i64>
5332 %3 = mul <4 x i64> %1, %2
5333 %4 = add <4 x i64> %3, <i64 -3, i64 -1, i64 1, i64 3>
5334 %5 = trunc <4 x i64> %4 to <4 x i32>
5338 define <4 x i32> @mul_add_self_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
5339 ; SSE-LABEL: mul_add_self_v4i64_v4i32:
5341 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
5342 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3]
5343 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
5344 ; SSE-NEXT: pmuludq %xmm2, %xmm0
5345 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
5346 ; SSE-NEXT: pmuludq %xmm3, %xmm1
5347 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
5348 ; SSE-NEXT: paddd %xmm0, %xmm0
5351 ; AVX-LABEL: mul_add_self_v4i64_v4i32:
5353 ; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
5354 ; AVX-NEXT: vpaddd %xmm0, %xmm0, %xmm0
5356 %1 = sext <4 x i32> %a0 to <4 x i64>
5357 %2 = sext <4 x i32> %a1 to <4 x i64>
5358 %3 = mul <4 x i64> %1, %2
5359 %4 = add <4 x i64> %3, %3
5360 %5 = trunc <4 x i64> %4 to <4 x i32>
5364 define <4 x i32> @mul_add_multiuse_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
5365 ; SSE-LABEL: mul_add_multiuse_v4i64_v4i32:
5367 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
5368 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3]
5369 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,1,1,3]
5370 ; SSE-NEXT: pmuludq %xmm2, %xmm4
5371 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
5372 ; SSE-NEXT: pmuludq %xmm3, %xmm1
5373 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm1[0,2]
5374 ; SSE-NEXT: paddd %xmm4, %xmm0
5377 ; AVX-LABEL: mul_add_multiuse_v4i64_v4i32:
5379 ; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm1
5380 ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
5382 %1 = sext <4 x i32> %a0 to <4 x i64>
5383 %2 = sext <4 x i32> %a1 to <4 x i64>
5384 %3 = mul <4 x i64> %1, %2
5385 %4 = add <4 x i64> %1, %3
5386 %5 = trunc <4 x i64> %4 to <4 x i32>