1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST-ALL
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST-PERLANE
7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F
8 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW
9 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW
10 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512DQ
11 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512DQ
17 define <4 x i32> @trunc_add_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
18 ; SSE-LABEL: trunc_add_v4i64_v4i32:
20 ; SSE-NEXT: paddq %xmm3, %xmm1
21 ; SSE-NEXT: paddq %xmm2, %xmm0
22 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
25 ; AVX1-LABEL: trunc_add_v4i64_v4i32:
27 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
28 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
29 ; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2
30 ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
31 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
32 ; AVX1-NEXT: vzeroupper
35 ; AVX2-SLOW-LABEL: trunc_add_v4i64_v4i32:
37 ; AVX2-SLOW-NEXT: vpaddq %ymm1, %ymm0, %ymm0
38 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
39 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
40 ; AVX2-SLOW-NEXT: vzeroupper
41 ; AVX2-SLOW-NEXT: retq
43 ; AVX2-FAST-ALL-LABEL: trunc_add_v4i64_v4i32:
44 ; AVX2-FAST-ALL: # %bb.0:
45 ; AVX2-FAST-ALL-NEXT: vpaddq %ymm1, %ymm0, %ymm0
46 ; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,6,0,0,0,0]
47 ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0
48 ; AVX2-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
49 ; AVX2-FAST-ALL-NEXT: vzeroupper
50 ; AVX2-FAST-ALL-NEXT: retq
52 ; AVX2-FAST-PERLANE-LABEL: trunc_add_v4i64_v4i32:
53 ; AVX2-FAST-PERLANE: # %bb.0:
54 ; AVX2-FAST-PERLANE-NEXT: vpaddq %ymm1, %ymm0, %ymm0
55 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1
56 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
57 ; AVX2-FAST-PERLANE-NEXT: vzeroupper
58 ; AVX2-FAST-PERLANE-NEXT: retq
60 ; AVX512-LABEL: trunc_add_v4i64_v4i32:
62 ; AVX512-NEXT: vpaddq %ymm1, %ymm0, %ymm0
63 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
64 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
65 ; AVX512-NEXT: vzeroupper
67 %1 = add <4 x i64> %a0, %a1
68 %2 = trunc <4 x i64> %1 to <4 x i32>
72 define <8 x i16> @trunc_add_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
73 ; SSE-LABEL: trunc_add_v8i64_v8i16:
75 ; SSE-NEXT: paddq %xmm5, %xmm1
76 ; SSE-NEXT: paddq %xmm4, %xmm0
77 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
78 ; SSE-NEXT: paddq %xmm7, %xmm3
79 ; SSE-NEXT: paddq %xmm6, %xmm2
80 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
81 ; SSE-NEXT: pslld $16, %xmm2
82 ; SSE-NEXT: psrad $16, %xmm2
83 ; SSE-NEXT: pslld $16, %xmm0
84 ; SSE-NEXT: psrad $16, %xmm0
85 ; SSE-NEXT: packssdw %xmm2, %xmm0
88 ; AVX1-LABEL: trunc_add_v8i64_v8i16:
90 ; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm4
91 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
92 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
93 ; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0
94 ; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm2
95 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
96 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
97 ; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1
98 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
99 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
100 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
101 ; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
102 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
103 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1,2,3],xmm4[4],xmm3[5,6,7]
104 ; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
105 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
106 ; AVX1-NEXT: vzeroupper
109 ; AVX2-LABEL: trunc_add_v8i64_v8i16:
111 ; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
112 ; AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1
113 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
114 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
115 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
116 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
117 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
118 ; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
119 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
120 ; AVX2-NEXT: vzeroupper
123 ; AVX512-LABEL: trunc_add_v8i64_v8i16:
125 ; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0
126 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0
127 ; AVX512-NEXT: vzeroupper
129 %1 = add <8 x i64> %a0, %a1
130 %2 = trunc <8 x i64> %1 to <8 x i16>
134 define <8 x i16> @trunc_add_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
135 ; SSE-LABEL: trunc_add_v8i32_v8i16:
137 ; SSE-NEXT: paddd %xmm2, %xmm0
138 ; SSE-NEXT: paddd %xmm3, %xmm1
139 ; SSE-NEXT: pslld $16, %xmm1
140 ; SSE-NEXT: psrad $16, %xmm1
141 ; SSE-NEXT: pslld $16, %xmm0
142 ; SSE-NEXT: psrad $16, %xmm0
143 ; SSE-NEXT: packssdw %xmm1, %xmm0
146 ; AVX1-LABEL: trunc_add_v8i32_v8i16:
148 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm2
149 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
150 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
151 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
152 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
153 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
154 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4],xmm1[5],xmm2[6],xmm1[7]
155 ; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0
156 ; AVX1-NEXT: vzeroupper
159 ; AVX2-LABEL: trunc_add_v8i32_v8i16:
161 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
162 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
163 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
164 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
165 ; AVX2-NEXT: vzeroupper
168 ; AVX512-LABEL: trunc_add_v8i32_v8i16:
170 ; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0
171 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
172 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
173 ; AVX512-NEXT: vzeroupper
175 %1 = add <8 x i32> %a0, %a1
176 %2 = trunc <8 x i32> %1 to <8 x i16>
180 define <16 x i8> @trunc_add_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
181 ; SSE-LABEL: trunc_add_v16i64_v16i8:
183 ; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm0
184 ; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm1
185 ; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm2
186 ; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm3
187 ; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm4
188 ; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm5
189 ; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm6
190 ; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm7
191 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
192 ; SSE-NEXT: pand %xmm8, %xmm7
193 ; SSE-NEXT: pand %xmm8, %xmm6
194 ; SSE-NEXT: packuswb %xmm7, %xmm6
195 ; SSE-NEXT: pand %xmm8, %xmm5
196 ; SSE-NEXT: pand %xmm8, %xmm4
197 ; SSE-NEXT: packuswb %xmm5, %xmm4
198 ; SSE-NEXT: packuswb %xmm6, %xmm4
199 ; SSE-NEXT: pand %xmm8, %xmm3
200 ; SSE-NEXT: pand %xmm8, %xmm2
201 ; SSE-NEXT: packuswb %xmm3, %xmm2
202 ; SSE-NEXT: pand %xmm8, %xmm1
203 ; SSE-NEXT: pand %xmm8, %xmm0
204 ; SSE-NEXT: packuswb %xmm1, %xmm0
205 ; SSE-NEXT: packuswb %xmm2, %xmm0
206 ; SSE-NEXT: packuswb %xmm4, %xmm0
209 ; AVX1-LABEL: trunc_add_v16i64_v16i8:
211 ; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm8
212 ; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm4
213 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
214 ; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm0
215 ; AVX1-NEXT: vpaddq %xmm5, %xmm1, %xmm4
216 ; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm5
217 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
218 ; AVX1-NEXT: vpaddq %xmm5, %xmm1, %xmm1
219 ; AVX1-NEXT: vpaddq %xmm6, %xmm2, %xmm5
220 ; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm6
221 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
222 ; AVX1-NEXT: vpaddq %xmm6, %xmm2, %xmm2
223 ; AVX1-NEXT: vpaddq %xmm7, %xmm3, %xmm6
224 ; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm7
225 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
226 ; AVX1-NEXT: vpaddq %xmm7, %xmm3, %xmm3
227 ; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm7 = [255,255]
228 ; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
229 ; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6
230 ; AVX1-NEXT: vpackusdw %xmm3, %xmm6, %xmm3
231 ; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2
232 ; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5
233 ; AVX1-NEXT: vpackusdw %xmm2, %xmm5, %xmm2
234 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
235 ; AVX1-NEXT: vpand %xmm7, %xmm1, %xmm1
236 ; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm3
237 ; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1
238 ; AVX1-NEXT: vpand %xmm7, %xmm0, %xmm0
239 ; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm3
240 ; AVX1-NEXT: vpackusdw %xmm0, %xmm3, %xmm0
241 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
242 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
243 ; AVX1-NEXT: vzeroupper
246 ; AVX2-LABEL: trunc_add_v16i64_v16i8:
248 ; AVX2-NEXT: vpaddq %ymm4, %ymm0, %ymm0
249 ; AVX2-NEXT: vpaddq %ymm5, %ymm1, %ymm1
250 ; AVX2-NEXT: vpaddq %ymm6, %ymm2, %ymm2
251 ; AVX2-NEXT: vpaddq %ymm7, %ymm3, %ymm3
252 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
253 ; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
254 ; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
255 ; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2
256 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
257 ; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
258 ; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
259 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
260 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
261 ; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0
262 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
263 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
264 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
265 ; AVX2-NEXT: vzeroupper
268 ; AVX512-LABEL: trunc_add_v16i64_v16i8:
270 ; AVX512-NEXT: vpaddq %zmm2, %zmm0, %zmm0
271 ; AVX512-NEXT: vpaddq %zmm3, %zmm1, %zmm1
272 ; AVX512-NEXT: vpmovqb %zmm1, %xmm1
273 ; AVX512-NEXT: vpmovqb %zmm0, %xmm0
274 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
275 ; AVX512-NEXT: vzeroupper
277 %1 = add <16 x i64> %a0, %a1
278 %2 = trunc <16 x i64> %1 to <16 x i8>
282 define <16 x i8> @trunc_add_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
283 ; SSE-LABEL: trunc_add_v16i32_v16i8:
285 ; SSE-NEXT: paddd %xmm4, %xmm0
286 ; SSE-NEXT: paddd %xmm5, %xmm1
287 ; SSE-NEXT: paddd %xmm6, %xmm2
288 ; SSE-NEXT: paddd %xmm7, %xmm3
289 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
290 ; SSE-NEXT: pand %xmm4, %xmm3
291 ; SSE-NEXT: pand %xmm4, %xmm2
292 ; SSE-NEXT: packuswb %xmm3, %xmm2
293 ; SSE-NEXT: pand %xmm4, %xmm1
294 ; SSE-NEXT: pand %xmm4, %xmm0
295 ; SSE-NEXT: packuswb %xmm1, %xmm0
296 ; SSE-NEXT: packuswb %xmm2, %xmm0
299 ; AVX1-LABEL: trunc_add_v16i32_v16i8:
301 ; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm4
302 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
303 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
304 ; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
305 ; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm2
306 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
307 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
308 ; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1
309 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255]
310 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
311 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
312 ; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
313 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
314 ; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm2
315 ; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
316 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
317 ; AVX1-NEXT: vzeroupper
320 ; AVX2-LABEL: trunc_add_v16i32_v16i8:
322 ; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0
323 ; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1
324 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
325 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
326 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
327 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
328 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
329 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
330 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
331 ; AVX2-NEXT: vzeroupper
334 ; AVX512-LABEL: trunc_add_v16i32_v16i8:
336 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
337 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
338 ; AVX512-NEXT: vzeroupper
340 %1 = add <16 x i32> %a0, %a1
341 %2 = trunc <16 x i32> %1 to <16 x i8>
345 define <16 x i8> @trunc_add_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
346 ; SSE-LABEL: trunc_add_v16i16_v16i8:
348 ; SSE-NEXT: paddw %xmm2, %xmm0
349 ; SSE-NEXT: paddw %xmm3, %xmm1
350 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
351 ; SSE-NEXT: pand %xmm2, %xmm1
352 ; SSE-NEXT: pand %xmm2, %xmm0
353 ; SSE-NEXT: packuswb %xmm1, %xmm0
356 ; AVX1-LABEL: trunc_add_v16i16_v16i8:
358 ; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm2
359 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
360 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
361 ; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
362 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
363 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
364 ; AVX1-NEXT: vpand %xmm1, %xmm2, %xmm1
365 ; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0
366 ; AVX1-NEXT: vzeroupper
369 ; AVX2-LABEL: trunc_add_v16i16_v16i8:
371 ; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
372 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
373 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
374 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
375 ; AVX2-NEXT: vzeroupper
378 ; AVX512F-LABEL: trunc_add_v16i16_v16i8:
380 ; AVX512F-NEXT: vpaddw %ymm1, %ymm0, %ymm0
381 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
382 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
383 ; AVX512F-NEXT: vzeroupper
386 ; AVX512BW-LABEL: trunc_add_v16i16_v16i8:
388 ; AVX512BW-NEXT: vpaddw %ymm1, %ymm0, %ymm0
389 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
390 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
391 ; AVX512BW-NEXT: vzeroupper
392 ; AVX512BW-NEXT: retq
394 ; AVX512DQ-LABEL: trunc_add_v16i16_v16i8:
396 ; AVX512DQ-NEXT: vpaddw %ymm1, %ymm0, %ymm0
397 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
398 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
399 ; AVX512DQ-NEXT: vzeroupper
400 ; AVX512DQ-NEXT: retq
401 %1 = add <16 x i16> %a0, %a1
402 %2 = trunc <16 x i16> %1 to <16 x i8>
406 define <8 x i16> @trunc_add_v8i32_v8i16_sext_8i8(<16 x i8> %a0, <8 x i32> %a1) {
407 ; SSE-LABEL: trunc_add_v8i32_v8i16_sext_8i8:
409 ; SSE-NEXT: pslld $16, %xmm2
410 ; SSE-NEXT: psrad $16, %xmm2
411 ; SSE-NEXT: pslld $16, %xmm1
412 ; SSE-NEXT: psrad $16, %xmm1
413 ; SSE-NEXT: packssdw %xmm2, %xmm1
414 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
415 ; SSE-NEXT: psraw $8, %xmm0
416 ; SSE-NEXT: paddw %xmm1, %xmm0
419 ; AVX1-LABEL: trunc_add_v8i32_v8i16_sext_8i8:
421 ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
422 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
423 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
424 ; AVX1-NEXT: vpmovsxbw %xmm0, %xmm0
425 ; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
426 ; AVX1-NEXT: vzeroupper
429 ; AVX2-LABEL: trunc_add_v8i32_v8i16_sext_8i8:
431 ; AVX2-NEXT: vpmovsxbw %xmm0, %xmm0
432 ; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
433 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
434 ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
435 ; AVX2-NEXT: vzeroupper
438 ; AVX512-LABEL: trunc_add_v8i32_v8i16_sext_8i8:
440 ; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
441 ; AVX512-NEXT: vpmovdw %zmm1, %ymm1
442 ; AVX512-NEXT: vpmovsxbw %xmm0, %xmm0
443 ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
444 ; AVX512-NEXT: vzeroupper
446 %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
447 %2 = sext <8 x i8> %1 to <8 x i32>
448 %3 = add <8 x i32> %2, %a1
449 %4 = trunc <8 x i32> %3 to <8 x i16>
457 define <4 x i32> @trunc_add_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
458 ; SSE-LABEL: trunc_add_const_v4i64_v4i32:
460 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
461 ; SSE-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
464 ; AVX1-LABEL: trunc_add_const_v4i64_v4i32:
466 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
467 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
468 ; AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
469 ; AVX1-NEXT: vzeroupper
472 ; AVX2-SLOW-LABEL: trunc_add_const_v4i64_v4i32:
473 ; AVX2-SLOW: # %bb.0:
474 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
475 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
476 ; AVX2-SLOW-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
477 ; AVX2-SLOW-NEXT: vzeroupper
478 ; AVX2-SLOW-NEXT: retq
480 ; AVX2-FAST-ALL-LABEL: trunc_add_const_v4i64_v4i32:
481 ; AVX2-FAST-ALL: # %bb.0:
482 ; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,6,0,0,0,0]
483 ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0
484 ; AVX2-FAST-ALL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
485 ; AVX2-FAST-ALL-NEXT: vzeroupper
486 ; AVX2-FAST-ALL-NEXT: retq
488 ; AVX2-FAST-PERLANE-LABEL: trunc_add_const_v4i64_v4i32:
489 ; AVX2-FAST-PERLANE: # %bb.0:
490 ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1
491 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
492 ; AVX2-FAST-PERLANE-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
493 ; AVX2-FAST-PERLANE-NEXT: vzeroupper
494 ; AVX2-FAST-PERLANE-NEXT: retq
496 ; AVX512-LABEL: trunc_add_const_v4i64_v4i32:
498 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
499 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
500 ; AVX512-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
501 ; AVX512-NEXT: vzeroupper
503 %1 = add <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
504 %2 = trunc <4 x i64> %1 to <4 x i32>
508 define <8 x i16> @trunc_add_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
509 ; SSE-LABEL: trunc_add_const_v8i64_v8i16:
511 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
512 ; SSE-NEXT: pslld $16, %xmm2
513 ; SSE-NEXT: psrad $16, %xmm2
514 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
515 ; SSE-NEXT: pslld $16, %xmm0
516 ; SSE-NEXT: psrad $16, %xmm0
517 ; SSE-NEXT: packssdw %xmm2, %xmm0
518 ; SSE-NEXT: paddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
521 ; AVX1-LABEL: trunc_add_const_v8i64_v8i16:
523 ; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,65535,65535,65535]
524 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
525 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
526 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
527 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
528 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
529 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
530 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
531 ; AVX1-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
532 ; AVX1-NEXT: vzeroupper
535 ; AVX2-LABEL: trunc_add_const_v8i64_v8i16:
537 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
538 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
539 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
540 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
541 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
542 ; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
543 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
544 ; AVX2-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
545 ; AVX2-NEXT: vzeroupper
548 ; AVX512-LABEL: trunc_add_const_v8i64_v8i16:
550 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0
551 ; AVX512-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
552 ; AVX512-NEXT: vzeroupper
554 %1 = add <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
555 %2 = trunc <8 x i64> %1 to <8 x i16>
559 define <8 x i16> @trunc_add_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
560 ; SSE-LABEL: trunc_add_const_v8i32_v8i16:
562 ; SSE-NEXT: pslld $16, %xmm1
563 ; SSE-NEXT: psrad $16, %xmm1
564 ; SSE-NEXT: pslld $16, %xmm0
565 ; SSE-NEXT: psrad $16, %xmm0
566 ; SSE-NEXT: packssdw %xmm1, %xmm0
567 ; SSE-NEXT: paddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
570 ; AVX1-LABEL: trunc_add_const_v8i32_v8i16:
572 ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
573 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
574 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
575 ; AVX1-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
576 ; AVX1-NEXT: vzeroupper
579 ; AVX2-LABEL: trunc_add_const_v8i32_v8i16:
581 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
582 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
583 ; AVX2-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
584 ; AVX2-NEXT: vzeroupper
587 ; AVX512-LABEL: trunc_add_const_v8i32_v8i16:
589 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
590 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
591 ; AVX512-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
592 ; AVX512-NEXT: vzeroupper
594 %1 = add <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
595 %2 = trunc <8 x i32> %1 to <8 x i16>
599 define <16 x i8> @trunc_add_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
600 ; SSE-LABEL: trunc_add_const_v16i64_v16i8:
602 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
603 ; SSE-NEXT: pand %xmm8, %xmm7
604 ; SSE-NEXT: pand %xmm8, %xmm6
605 ; SSE-NEXT: packuswb %xmm7, %xmm6
606 ; SSE-NEXT: pand %xmm8, %xmm5
607 ; SSE-NEXT: pand %xmm8, %xmm4
608 ; SSE-NEXT: packuswb %xmm5, %xmm4
609 ; SSE-NEXT: packuswb %xmm6, %xmm4
610 ; SSE-NEXT: pand %xmm8, %xmm3
611 ; SSE-NEXT: pand %xmm8, %xmm2
612 ; SSE-NEXT: packuswb %xmm3, %xmm2
613 ; SSE-NEXT: pand %xmm8, %xmm1
614 ; SSE-NEXT: pand %xmm8, %xmm0
615 ; SSE-NEXT: packuswb %xmm1, %xmm0
616 ; SSE-NEXT: packuswb %xmm2, %xmm0
617 ; SSE-NEXT: packuswb %xmm4, %xmm0
618 ; SSE-NEXT: paddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
621 ; AVX1-LABEL: trunc_add_const_v16i64_v16i8:
623 ; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [255,255,255,255]
624 ; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
625 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
626 ; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
627 ; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
628 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
629 ; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2
630 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
631 ; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
632 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
633 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
634 ; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
635 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
636 ; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
637 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
638 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
639 ; AVX1-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
640 ; AVX1-NEXT: vzeroupper
643 ; AVX2-LABEL: trunc_add_const_v16i64_v16i8:
645 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
646 ; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
647 ; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
648 ; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2
649 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
650 ; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
651 ; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
652 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
653 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
654 ; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0
655 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
656 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
657 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
658 ; AVX2-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
659 ; AVX2-NEXT: vzeroupper
662 ; AVX512-LABEL: trunc_add_const_v16i64_v16i8:
664 ; AVX512-NEXT: vpmovqb %zmm1, %xmm1
665 ; AVX512-NEXT: vpmovqb %zmm0, %xmm0
666 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
667 ; AVX512-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
668 ; AVX512-NEXT: vzeroupper
670 %1 = add <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
671 %2 = trunc <16 x i64> %1 to <16 x i8>
675 define <16 x i8> @trunc_add_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
676 ; SSE-LABEL: trunc_add_const_v16i32_v16i8:
678 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
679 ; SSE-NEXT: pand %xmm4, %xmm3
680 ; SSE-NEXT: pand %xmm4, %xmm2
681 ; SSE-NEXT: packuswb %xmm3, %xmm2
682 ; SSE-NEXT: pand %xmm4, %xmm1
683 ; SSE-NEXT: pand %xmm4, %xmm0
684 ; SSE-NEXT: packuswb %xmm1, %xmm0
685 ; SSE-NEXT: packuswb %xmm2, %xmm0
686 ; SSE-NEXT: paddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
689 ; AVX1-LABEL: trunc_add_const_v16i32_v16i8:
691 ; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
692 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
693 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
694 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
695 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
696 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
697 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
698 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
699 ; AVX1-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
700 ; AVX1-NEXT: vzeroupper
703 ; AVX2-LABEL: trunc_add_const_v16i32_v16i8:
705 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
706 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
707 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
708 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
709 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
710 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
711 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
712 ; AVX2-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
713 ; AVX2-NEXT: vzeroupper
716 ; AVX512-LABEL: trunc_add_const_v16i32_v16i8:
718 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
719 ; AVX512-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
720 ; AVX512-NEXT: vzeroupper
722 %1 = add <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
723 %2 = trunc <16 x i32> %1 to <16 x i8>
727 define <16 x i8> @trunc_add_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
728 ; SSE-LABEL: trunc_add_const_v16i16_v16i8:
730 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
731 ; SSE-NEXT: pand %xmm2, %xmm1
732 ; SSE-NEXT: pand %xmm2, %xmm0
733 ; SSE-NEXT: packuswb %xmm1, %xmm0
734 ; SSE-NEXT: paddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
737 ; AVX1-LABEL: trunc_add_const_v16i16_v16i8:
739 ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
740 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
741 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
742 ; AVX1-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
743 ; AVX1-NEXT: vzeroupper
746 ; AVX2-LABEL: trunc_add_const_v16i16_v16i8:
748 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
749 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
750 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
751 ; AVX2-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
752 ; AVX2-NEXT: vzeroupper
755 ; AVX512F-LABEL: trunc_add_const_v16i16_v16i8:
757 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
758 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
759 ; AVX512F-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
760 ; AVX512F-NEXT: vzeroupper
763 ; AVX512BW-LABEL: trunc_add_const_v16i16_v16i8:
765 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
766 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
767 ; AVX512BW-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
768 ; AVX512BW-NEXT: vzeroupper
769 ; AVX512BW-NEXT: retq
771 ; AVX512DQ-LABEL: trunc_add_const_v16i16_v16i8:
773 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
774 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
775 ; AVX512DQ-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
776 ; AVX512DQ-NEXT: vzeroupper
777 ; AVX512DQ-NEXT: retq
778 %1 = add <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
779 %2 = trunc <16 x i16> %1 to <16 x i8>
787 define <4 x i32> @trunc_sub_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
788 ; SSE-LABEL: trunc_sub_v4i64_v4i32:
790 ; SSE-NEXT: psubq %xmm3, %xmm1
791 ; SSE-NEXT: psubq %xmm2, %xmm0
792 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
795 ; AVX1-LABEL: trunc_sub_v4i64_v4i32:
797 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
798 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
799 ; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2
800 ; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0
801 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
802 ; AVX1-NEXT: vzeroupper
805 ; AVX2-SLOW-LABEL: trunc_sub_v4i64_v4i32:
806 ; AVX2-SLOW: # %bb.0:
807 ; AVX2-SLOW-NEXT: vpsubq %ymm1, %ymm0, %ymm0
808 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
809 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
810 ; AVX2-SLOW-NEXT: vzeroupper
811 ; AVX2-SLOW-NEXT: retq
813 ; AVX2-FAST-ALL-LABEL: trunc_sub_v4i64_v4i32:
814 ; AVX2-FAST-ALL: # %bb.0:
815 ; AVX2-FAST-ALL-NEXT: vpsubq %ymm1, %ymm0, %ymm0
816 ; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,6,0,0,0,0]
817 ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0
818 ; AVX2-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
819 ; AVX2-FAST-ALL-NEXT: vzeroupper
820 ; AVX2-FAST-ALL-NEXT: retq
822 ; AVX2-FAST-PERLANE-LABEL: trunc_sub_v4i64_v4i32:
823 ; AVX2-FAST-PERLANE: # %bb.0:
824 ; AVX2-FAST-PERLANE-NEXT: vpsubq %ymm1, %ymm0, %ymm0
825 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1
826 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
827 ; AVX2-FAST-PERLANE-NEXT: vzeroupper
828 ; AVX2-FAST-PERLANE-NEXT: retq
830 ; AVX512-LABEL: trunc_sub_v4i64_v4i32:
832 ; AVX512-NEXT: vpsubq %ymm1, %ymm0, %ymm0
833 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
834 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
835 ; AVX512-NEXT: vzeroupper
837 %1 = sub <4 x i64> %a0, %a1
838 %2 = trunc <4 x i64> %1 to <4 x i32>
842 define <8 x i16> @trunc_sub_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
843 ; SSE-LABEL: trunc_sub_v8i64_v8i16:
845 ; SSE-NEXT: psubq %xmm5, %xmm1
846 ; SSE-NEXT: psubq %xmm4, %xmm0
847 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
848 ; SSE-NEXT: psubq %xmm7, %xmm3
849 ; SSE-NEXT: psubq %xmm6, %xmm2
850 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
851 ; SSE-NEXT: pslld $16, %xmm2
852 ; SSE-NEXT: psrad $16, %xmm2
853 ; SSE-NEXT: pslld $16, %xmm0
854 ; SSE-NEXT: psrad $16, %xmm0
855 ; SSE-NEXT: packssdw %xmm2, %xmm0
858 ; AVX1-LABEL: trunc_sub_v8i64_v8i16:
860 ; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm4
861 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
862 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
863 ; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0
864 ; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm2
865 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
866 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
867 ; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm1
868 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
869 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
870 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
871 ; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
872 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
873 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1,2,3],xmm4[4],xmm3[5,6,7]
874 ; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
875 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
876 ; AVX1-NEXT: vzeroupper
879 ; AVX2-LABEL: trunc_sub_v8i64_v8i16:
881 ; AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0
882 ; AVX2-NEXT: vpsubq %ymm3, %ymm1, %ymm1
883 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
884 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
885 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
886 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
887 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
888 ; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
889 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
890 ; AVX2-NEXT: vzeroupper
893 ; AVX512-LABEL: trunc_sub_v8i64_v8i16:
895 ; AVX512-NEXT: vpsubq %zmm1, %zmm0, %zmm0
896 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0
897 ; AVX512-NEXT: vzeroupper
899 %1 = sub <8 x i64> %a0, %a1
900 %2 = trunc <8 x i64> %1 to <8 x i16>
904 define <8 x i16> @trunc_sub_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
905 ; SSE-LABEL: trunc_sub_v8i32_v8i16:
907 ; SSE-NEXT: psubd %xmm2, %xmm0
908 ; SSE-NEXT: psubd %xmm3, %xmm1
909 ; SSE-NEXT: pslld $16, %xmm1
910 ; SSE-NEXT: psrad $16, %xmm1
911 ; SSE-NEXT: pslld $16, %xmm0
912 ; SSE-NEXT: psrad $16, %xmm0
913 ; SSE-NEXT: packssdw %xmm1, %xmm0
916 ; AVX1-LABEL: trunc_sub_v8i32_v8i16:
918 ; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm2
919 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
920 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
921 ; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
922 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
923 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
924 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4],xmm1[5],xmm2[6],xmm1[7]
925 ; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0
926 ; AVX1-NEXT: vzeroupper
929 ; AVX2-LABEL: trunc_sub_v8i32_v8i16:
931 ; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0
932 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
933 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
934 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
935 ; AVX2-NEXT: vzeroupper
938 ; AVX512-LABEL: trunc_sub_v8i32_v8i16:
940 ; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm0
941 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
942 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
943 ; AVX512-NEXT: vzeroupper
945 %1 = sub <8 x i32> %a0, %a1
946 %2 = trunc <8 x i32> %1 to <8 x i16>
950 define <16 x i8> @trunc_sub_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
951 ; SSE-LABEL: trunc_sub_v16i64_v16i8:
953 ; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm0
954 ; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm1
955 ; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm2
956 ; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm3
957 ; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm4
958 ; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm5
959 ; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm6
960 ; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm7
961 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
962 ; SSE-NEXT: pand %xmm8, %xmm7
963 ; SSE-NEXT: pand %xmm8, %xmm6
964 ; SSE-NEXT: packuswb %xmm7, %xmm6
965 ; SSE-NEXT: pand %xmm8, %xmm5
966 ; SSE-NEXT: pand %xmm8, %xmm4
967 ; SSE-NEXT: packuswb %xmm5, %xmm4
968 ; SSE-NEXT: packuswb %xmm6, %xmm4
969 ; SSE-NEXT: pand %xmm8, %xmm3
970 ; SSE-NEXT: pand %xmm8, %xmm2
971 ; SSE-NEXT: packuswb %xmm3, %xmm2
972 ; SSE-NEXT: pand %xmm8, %xmm1
973 ; SSE-NEXT: pand %xmm8, %xmm0
974 ; SSE-NEXT: packuswb %xmm1, %xmm0
975 ; SSE-NEXT: packuswb %xmm2, %xmm0
976 ; SSE-NEXT: packuswb %xmm4, %xmm0
979 ; AVX1-LABEL: trunc_sub_v16i64_v16i8:
981 ; AVX1-NEXT: vpsubq %xmm4, %xmm0, %xmm8
982 ; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm4
983 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
984 ; AVX1-NEXT: vpsubq %xmm4, %xmm0, %xmm0
985 ; AVX1-NEXT: vpsubq %xmm5, %xmm1, %xmm4
986 ; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm5
987 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
988 ; AVX1-NEXT: vpsubq %xmm5, %xmm1, %xmm1
989 ; AVX1-NEXT: vpsubq %xmm6, %xmm2, %xmm5
990 ; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm6
991 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
992 ; AVX1-NEXT: vpsubq %xmm6, %xmm2, %xmm2
993 ; AVX1-NEXT: vpsubq %xmm7, %xmm3, %xmm6
994 ; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm7
995 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
996 ; AVX1-NEXT: vpsubq %xmm7, %xmm3, %xmm3
997 ; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm7 = [255,255]
998 ; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
999 ; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6
1000 ; AVX1-NEXT: vpackusdw %xmm3, %xmm6, %xmm3
1001 ; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2
1002 ; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5
1003 ; AVX1-NEXT: vpackusdw %xmm2, %xmm5, %xmm2
1004 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
1005 ; AVX1-NEXT: vpand %xmm7, %xmm1, %xmm1
1006 ; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm3
1007 ; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1
1008 ; AVX1-NEXT: vpand %xmm7, %xmm0, %xmm0
1009 ; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm3
1010 ; AVX1-NEXT: vpackusdw %xmm0, %xmm3, %xmm0
1011 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
1012 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
1013 ; AVX1-NEXT: vzeroupper
1016 ; AVX2-LABEL: trunc_sub_v16i64_v16i8:
1018 ; AVX2-NEXT: vpsubq %ymm4, %ymm0, %ymm0
1019 ; AVX2-NEXT: vpsubq %ymm5, %ymm1, %ymm1
1020 ; AVX2-NEXT: vpsubq %ymm6, %ymm2, %ymm2
1021 ; AVX2-NEXT: vpsubq %ymm7, %ymm3, %ymm3
1022 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
1023 ; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
1024 ; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
1025 ; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2
1026 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
1027 ; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
1028 ; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
1029 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
1030 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
1031 ; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0
1032 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1033 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
1034 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
1035 ; AVX2-NEXT: vzeroupper
1038 ; AVX512-LABEL: trunc_sub_v16i64_v16i8:
1040 ; AVX512-NEXT: vpsubq %zmm2, %zmm0, %zmm0
1041 ; AVX512-NEXT: vpsubq %zmm3, %zmm1, %zmm1
1042 ; AVX512-NEXT: vpmovqb %zmm1, %xmm1
1043 ; AVX512-NEXT: vpmovqb %zmm0, %xmm0
1044 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1045 ; AVX512-NEXT: vzeroupper
1047 %1 = sub <16 x i64> %a0, %a1
1048 %2 = trunc <16 x i64> %1 to <16 x i8>
1052 define <16 x i8> @trunc_sub_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
1053 ; SSE-LABEL: trunc_sub_v16i32_v16i8:
1055 ; SSE-NEXT: psubd %xmm4, %xmm0
1056 ; SSE-NEXT: psubd %xmm5, %xmm1
1057 ; SSE-NEXT: psubd %xmm6, %xmm2
1058 ; SSE-NEXT: psubd %xmm7, %xmm3
1059 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
1060 ; SSE-NEXT: pand %xmm4, %xmm3
1061 ; SSE-NEXT: pand %xmm4, %xmm2
1062 ; SSE-NEXT: packuswb %xmm3, %xmm2
1063 ; SSE-NEXT: pand %xmm4, %xmm1
1064 ; SSE-NEXT: pand %xmm4, %xmm0
1065 ; SSE-NEXT: packuswb %xmm1, %xmm0
1066 ; SSE-NEXT: packuswb %xmm2, %xmm0
1069 ; AVX1-LABEL: trunc_sub_v16i32_v16i8:
1071 ; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm4
1072 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
1073 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1074 ; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm0
1075 ; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm2
1076 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
1077 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
1078 ; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm1
1079 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255]
1080 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
1081 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
1082 ; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
1083 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
1084 ; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm2
1085 ; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
1086 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
1087 ; AVX1-NEXT: vzeroupper
1090 ; AVX2-LABEL: trunc_sub_v16i32_v16i8:
1092 ; AVX2-NEXT: vpsubd %ymm2, %ymm0, %ymm0
1093 ; AVX2-NEXT: vpsubd %ymm3, %ymm1, %ymm1
1094 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
1095 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
1096 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
1097 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
1098 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1099 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
1100 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
1101 ; AVX2-NEXT: vzeroupper
1104 ; AVX512-LABEL: trunc_sub_v16i32_v16i8:
1106 ; AVX512-NEXT: vpsubd %zmm1, %zmm0, %zmm0
1107 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
1108 ; AVX512-NEXT: vzeroupper
1110 %1 = sub <16 x i32> %a0, %a1
1111 %2 = trunc <16 x i32> %1 to <16 x i8>
1115 define <16 x i8> @trunc_sub_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
1116 ; SSE-LABEL: trunc_sub_v16i16_v16i8:
1118 ; SSE-NEXT: psubw %xmm2, %xmm0
1119 ; SSE-NEXT: psubw %xmm3, %xmm1
1120 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
1121 ; SSE-NEXT: pand %xmm2, %xmm1
1122 ; SSE-NEXT: pand %xmm2, %xmm0
1123 ; SSE-NEXT: packuswb %xmm1, %xmm0
1126 ; AVX1-LABEL: trunc_sub_v16i16_v16i8:
1128 ; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm2
1129 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
1130 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1131 ; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm0
1132 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
1133 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
1134 ; AVX1-NEXT: vpand %xmm1, %xmm2, %xmm1
1135 ; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0
1136 ; AVX1-NEXT: vzeroupper
1139 ; AVX2-LABEL: trunc_sub_v16i16_v16i8:
1141 ; AVX2-NEXT: vpsubw %ymm1, %ymm0, %ymm0
1142 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1143 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1144 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
1145 ; AVX2-NEXT: vzeroupper
1148 ; AVX512F-LABEL: trunc_sub_v16i16_v16i8:
1150 ; AVX512F-NEXT: vpsubw %ymm1, %ymm0, %ymm0
1151 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1152 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
1153 ; AVX512F-NEXT: vzeroupper
1154 ; AVX512F-NEXT: retq
1156 ; AVX512BW-LABEL: trunc_sub_v16i16_v16i8:
1157 ; AVX512BW: # %bb.0:
1158 ; AVX512BW-NEXT: vpsubw %ymm1, %ymm0, %ymm0
1159 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
1160 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1161 ; AVX512BW-NEXT: vzeroupper
1162 ; AVX512BW-NEXT: retq
1164 ; AVX512DQ-LABEL: trunc_sub_v16i16_v16i8:
1165 ; AVX512DQ: # %bb.0:
1166 ; AVX512DQ-NEXT: vpsubw %ymm1, %ymm0, %ymm0
1167 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1168 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
1169 ; AVX512DQ-NEXT: vzeroupper
1170 ; AVX512DQ-NEXT: retq
1171 %1 = sub <16 x i16> %a0, %a1
1172 %2 = trunc <16 x i16> %1 to <16 x i8>
1176 define <16 x i8> @trunc_ext_sub_v16i16_v16i8(<16 x i8> %x, <16 x i8> %y) {
1177 ; SSE-LABEL: trunc_ext_sub_v16i16_v16i8:
1179 ; SSE-NEXT: psubb %xmm1, %xmm0
1182 ; AVX-LABEL: trunc_ext_sub_v16i16_v16i8:
1184 ; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0
1186 %a = zext <16 x i8> %x to <16 x i16>
1187 %b = zext <16 x i8> %y to <16 x i16>
1188 %c = sub <16 x i16> %a, %b
1189 %d = trunc <16 x i16> %c to <16 x i8>
1197 define <4 x i32> @trunc_sub_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
1198 ; SSE-LABEL: trunc_sub_const_v4i64_v4i32:
1200 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
1201 ; SSE-NEXT: psubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1204 ; AVX1-LABEL: trunc_sub_const_v4i64_v4i32:
1206 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1207 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
1208 ; AVX1-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1209 ; AVX1-NEXT: vzeroupper
1212 ; AVX2-SLOW-LABEL: trunc_sub_const_v4i64_v4i32:
1213 ; AVX2-SLOW: # %bb.0:
1214 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
1215 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
1216 ; AVX2-SLOW-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1217 ; AVX2-SLOW-NEXT: vzeroupper
1218 ; AVX2-SLOW-NEXT: retq
1220 ; AVX2-FAST-ALL-LABEL: trunc_sub_const_v4i64_v4i32:
1221 ; AVX2-FAST-ALL: # %bb.0:
1222 ; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,6,0,0,0,0]
1223 ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0
1224 ; AVX2-FAST-ALL-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1225 ; AVX2-FAST-ALL-NEXT: vzeroupper
1226 ; AVX2-FAST-ALL-NEXT: retq
1228 ; AVX2-FAST-PERLANE-LABEL: trunc_sub_const_v4i64_v4i32:
1229 ; AVX2-FAST-PERLANE: # %bb.0:
1230 ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1
1231 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
1232 ; AVX2-FAST-PERLANE-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1233 ; AVX2-FAST-PERLANE-NEXT: vzeroupper
1234 ; AVX2-FAST-PERLANE-NEXT: retq
1236 ; AVX512-LABEL: trunc_sub_const_v4i64_v4i32:
1238 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1239 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
1240 ; AVX512-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1241 ; AVX512-NEXT: vzeroupper
1243 %1 = sub <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
1244 %2 = trunc <4 x i64> %1 to <4 x i32>
1248 define <8 x i16> @trunc_sub_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
1249 ; SSE-LABEL: trunc_sub_const_v8i64_v8i16:
1251 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
1252 ; SSE-NEXT: pslld $16, %xmm2
1253 ; SSE-NEXT: psrad $16, %xmm2
1254 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
1255 ; SSE-NEXT: pslld $16, %xmm0
1256 ; SSE-NEXT: psrad $16, %xmm0
1257 ; SSE-NEXT: packssdw %xmm2, %xmm0
1258 ; SSE-NEXT: psubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1261 ; AVX1-LABEL: trunc_sub_const_v8i64_v8i16:
1263 ; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,65535,65535,65535]
1264 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
1265 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
1266 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
1267 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
1268 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
1269 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
1270 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
1271 ; AVX1-NEXT: vpsubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1272 ; AVX1-NEXT: vzeroupper
1275 ; AVX2-LABEL: trunc_sub_const_v8i64_v8i16:
1277 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
1278 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
1279 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
1280 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
1281 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1282 ; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
1283 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
1284 ; AVX2-NEXT: vpsubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1285 ; AVX2-NEXT: vzeroupper
1288 ; AVX512-LABEL: trunc_sub_const_v8i64_v8i16:
1290 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0
1291 ; AVX512-NEXT: vpsubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1292 ; AVX512-NEXT: vzeroupper
1294 %1 = sub <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
1295 %2 = trunc <8 x i64> %1 to <8 x i16>
1299 define <8 x i16> @trunc_sub_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
1300 ; SSE-LABEL: trunc_sub_const_v8i32_v8i16:
1302 ; SSE-NEXT: pslld $16, %xmm1
1303 ; SSE-NEXT: psrad $16, %xmm1
1304 ; SSE-NEXT: pslld $16, %xmm0
1305 ; SSE-NEXT: psrad $16, %xmm0
1306 ; SSE-NEXT: packssdw %xmm1, %xmm0
1307 ; SSE-NEXT: psubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1310 ; AVX1-LABEL: trunc_sub_const_v8i32_v8i16:
1312 ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1313 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1314 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
1315 ; AVX1-NEXT: vpsubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1316 ; AVX1-NEXT: vzeroupper
1319 ; AVX2-LABEL: trunc_sub_const_v8i32_v8i16:
1321 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
1322 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1323 ; AVX2-NEXT: vpsubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1324 ; AVX2-NEXT: vzeroupper
1327 ; AVX512-LABEL: trunc_sub_const_v8i32_v8i16:
1329 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1330 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
1331 ; AVX512-NEXT: vpsubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1332 ; AVX512-NEXT: vzeroupper
1334 %1 = sub <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1335 %2 = trunc <8 x i32> %1 to <8 x i16>
1339 define <16 x i8> @trunc_sub_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
1340 ; SSE-LABEL: trunc_sub_const_v16i64_v16i8:
1342 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
1343 ; SSE-NEXT: pand %xmm8, %xmm7
1344 ; SSE-NEXT: pand %xmm8, %xmm6
1345 ; SSE-NEXT: packuswb %xmm7, %xmm6
1346 ; SSE-NEXT: pand %xmm8, %xmm5
1347 ; SSE-NEXT: pand %xmm8, %xmm4
1348 ; SSE-NEXT: packuswb %xmm5, %xmm4
1349 ; SSE-NEXT: packuswb %xmm6, %xmm4
1350 ; SSE-NEXT: pand %xmm8, %xmm3
1351 ; SSE-NEXT: pand %xmm8, %xmm2
1352 ; SSE-NEXT: packuswb %xmm3, %xmm2
1353 ; SSE-NEXT: pand %xmm8, %xmm1
1354 ; SSE-NEXT: pand %xmm8, %xmm0
1355 ; SSE-NEXT: packuswb %xmm1, %xmm0
1356 ; SSE-NEXT: packuswb %xmm2, %xmm0
1357 ; SSE-NEXT: packuswb %xmm4, %xmm0
1358 ; SSE-NEXT: psubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1361 ; AVX1-LABEL: trunc_sub_const_v16i64_v16i8:
1363 ; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [255,255,255,255]
1364 ; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
1365 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
1366 ; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
1367 ; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
1368 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
1369 ; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2
1370 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
1371 ; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
1372 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
1373 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
1374 ; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
1375 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
1376 ; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
1377 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
1378 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
1379 ; AVX1-NEXT: vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1380 ; AVX1-NEXT: vzeroupper
1383 ; AVX2-LABEL: trunc_sub_const_v16i64_v16i8:
1385 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
1386 ; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
1387 ; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
1388 ; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2
1389 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
1390 ; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
1391 ; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
1392 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
1393 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
1394 ; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0
1395 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1396 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
1397 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
1398 ; AVX2-NEXT: vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1399 ; AVX2-NEXT: vzeroupper
1402 ; AVX512-LABEL: trunc_sub_const_v16i64_v16i8:
1404 ; AVX512-NEXT: vpmovqb %zmm1, %xmm1
1405 ; AVX512-NEXT: vpmovqb %zmm0, %xmm0
1406 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1407 ; AVX512-NEXT: vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1408 ; AVX512-NEXT: vzeroupper
1410 %1 = sub <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
1411 %2 = trunc <16 x i64> %1 to <16 x i8>
1415 define <16 x i8> @trunc_sub_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
1416 ; SSE-LABEL: trunc_sub_const_v16i32_v16i8:
1418 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
1419 ; SSE-NEXT: pand %xmm4, %xmm3
1420 ; SSE-NEXT: pand %xmm4, %xmm2
1421 ; SSE-NEXT: packuswb %xmm3, %xmm2
1422 ; SSE-NEXT: pand %xmm4, %xmm1
1423 ; SSE-NEXT: pand %xmm4, %xmm0
1424 ; SSE-NEXT: packuswb %xmm1, %xmm0
1425 ; SSE-NEXT: packuswb %xmm2, %xmm0
1426 ; SSE-NEXT: psubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1429 ; AVX1-LABEL: trunc_sub_const_v16i32_v16i8:
1431 ; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
1432 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
1433 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
1434 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
1435 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
1436 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
1437 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
1438 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
1439 ; AVX1-NEXT: vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1440 ; AVX1-NEXT: vzeroupper
1443 ; AVX2-LABEL: trunc_sub_const_v16i32_v16i8:
1445 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
1446 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
1447 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
1448 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
1449 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1450 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
1451 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
1452 ; AVX2-NEXT: vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1453 ; AVX2-NEXT: vzeroupper
1456 ; AVX512-LABEL: trunc_sub_const_v16i32_v16i8:
1458 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
1459 ; AVX512-NEXT: vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1460 ; AVX512-NEXT: vzeroupper
1462 %1 = sub <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1463 %2 = trunc <16 x i32> %1 to <16 x i8>
1467 define <16 x i8> @trunc_sub_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
1468 ; SSE-LABEL: trunc_sub_const_v16i16_v16i8:
1470 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
1471 ; SSE-NEXT: pand %xmm2, %xmm1
1472 ; SSE-NEXT: pand %xmm2, %xmm0
1473 ; SSE-NEXT: packuswb %xmm1, %xmm0
1474 ; SSE-NEXT: psubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1477 ; AVX1-LABEL: trunc_sub_const_v16i16_v16i8:
1479 ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1480 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1481 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
1482 ; AVX1-NEXT: vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1483 ; AVX1-NEXT: vzeroupper
1486 ; AVX2-LABEL: trunc_sub_const_v16i16_v16i8:
1488 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1489 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1490 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
1491 ; AVX2-NEXT: vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1492 ; AVX2-NEXT: vzeroupper
1495 ; AVX512F-LABEL: trunc_sub_const_v16i16_v16i8:
1497 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1498 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
1499 ; AVX512F-NEXT: vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1500 ; AVX512F-NEXT: vzeroupper
1501 ; AVX512F-NEXT: retq
1503 ; AVX512BW-LABEL: trunc_sub_const_v16i16_v16i8:
1504 ; AVX512BW: # %bb.0:
1505 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1506 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
1507 ; AVX512BW-NEXT: vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1508 ; AVX512BW-NEXT: vzeroupper
1509 ; AVX512BW-NEXT: retq
1511 ; AVX512DQ-LABEL: trunc_sub_const_v16i16_v16i8:
1512 ; AVX512DQ: # %bb.0:
1513 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1514 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
1515 ; AVX512DQ-NEXT: vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1516 ; AVX512DQ-NEXT: vzeroupper
1517 ; AVX512DQ-NEXT: retq
1518 %1 = sub <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
1519 %2 = trunc <16 x i16> %1 to <16 x i8>
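; zext + subtract-by-constant + trunc folds the extension away entirely: both the SSE and AVX checks reduce to a single byte psubb.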
1523 define <16 x i8> @trunc_ext_sub_const_rhs_v16i16_v16i8(<16 x i8> %x) {
1524 ; SSE-LABEL: trunc_ext_sub_const_rhs_v16i16_v16i8:
1526 ; SSE-NEXT: psubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1529 ; AVX-LABEL: trunc_ext_sub_const_rhs_v16i16_v16i8:
1531 ; AVX-NEXT: vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1533 %a = zext <16 x i8> %x to <16 x i16>
1534 %b = sub <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
1535 %c = trunc <16 x i16> %b to <16 x i8>
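; Same fold with the constant on the left-hand side: the input is subtracted from a byte constant vector (movdqa + psubb), still entirely in the byte domain.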
1539 define <16 x i8> @trunc_ext_sub_const_lhs_v16i16_v16i8(<16 x i8> %x) {
1540 ; SSE-LABEL: trunc_ext_sub_const_lhs_v16i16_v16i8:
1542 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
1543 ; SSE-NEXT: psubb %xmm0, %xmm1
1544 ; SSE-NEXT: movdqa %xmm1, %xmm0
1547 ; AVX-LABEL: trunc_ext_sub_const_lhs_v16i16_v16i8:
1549 ; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
1550 ; AVX-NEXT: vpsubb %xmm0, %xmm1, %xmm0
1552 %a = zext <16 x i8> %x to <16 x i16>
1553 %b = sub <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, %a
1554 %c = trunc <16 x i16> %b to <16 x i8>
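; Multiply variants. Truncation only keeps the low bits of each product, so the multiply can be narrowed: AVX1/AVX2 truncate the operands first and use vpmulld, AVX512F/BW truncate via vpmovqd, while AVX512DQ multiplies at full width with vpmullq and truncates the result.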
1562 define <4 x i32> @trunc_mul_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
1563 ; SSE-LABEL: trunc_mul_v4i64_v4i32:
1565 ; SSE-NEXT: pmuludq %xmm3, %xmm1
1566 ; SSE-NEXT: pmuludq %xmm2, %xmm0
1567 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
1570 ; AVX1-LABEL: trunc_mul_v4i64_v4i32:
1572 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
1573 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
1574 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
1575 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
1576 ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
1577 ; AVX1-NEXT: vzeroupper
1580 ; AVX2-SLOW-LABEL: trunc_mul_v4i64_v4i32:
1581 ; AVX2-SLOW: # %bb.0:
1582 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
1583 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
1584 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
1585 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
1586 ; AVX2-SLOW-NEXT: vpmulld %xmm1, %xmm0, %xmm0
1587 ; AVX2-SLOW-NEXT: vzeroupper
1588 ; AVX2-SLOW-NEXT: retq
1590 ; AVX2-FAST-ALL-LABEL: trunc_mul_v4i64_v4i32:
1591 ; AVX2-FAST-ALL: # %bb.0:
1592 ; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
1593 ; AVX2-FAST-ALL-NEXT: vpermd %ymm1, %ymm2, %ymm1
1594 ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm2, %ymm0
1595 ; AVX2-FAST-ALL-NEXT: vpmulld %xmm1, %xmm0, %xmm0
1596 ; AVX2-FAST-ALL-NEXT: vzeroupper
1597 ; AVX2-FAST-ALL-NEXT: retq
1599 ; AVX2-FAST-PERLANE-LABEL: trunc_mul_v4i64_v4i32:
1600 ; AVX2-FAST-PERLANE: # %bb.0:
1601 ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm1, %xmm2
1602 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
1603 ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm2
1604 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
1605 ; AVX2-FAST-PERLANE-NEXT: vpmulld %xmm1, %xmm0, %xmm0
1606 ; AVX2-FAST-PERLANE-NEXT: vzeroupper
1607 ; AVX2-FAST-PERLANE-NEXT: retq
1609 ; AVX512F-LABEL: trunc_mul_v4i64_v4i32:
1611 ; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
1612 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1613 ; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
1614 ; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
1615 ; AVX512F-NEXT: vpmulld %xmm1, %xmm0, %xmm0
1616 ; AVX512F-NEXT: vzeroupper
1617 ; AVX512F-NEXT: retq
1619 ; AVX512BW-LABEL: trunc_mul_v4i64_v4i32:
1620 ; AVX512BW: # %bb.0:
1621 ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
1622 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1623 ; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
1624 ; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
1625 ; AVX512BW-NEXT: vpmulld %xmm1, %xmm0, %xmm0
1626 ; AVX512BW-NEXT: vzeroupper
1627 ; AVX512BW-NEXT: retq
1629 ; AVX512DQ-LABEL: trunc_mul_v4i64_v4i32:
1630 ; AVX512DQ: # %bb.0:
1631 ; AVX512DQ-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
1632 ; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1633 ; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
1634 ; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0
1635 ; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1636 ; AVX512DQ-NEXT: vzeroupper
1637 ; AVX512DQ-NEXT: retq
1638 %1 = mul <4 x i64> %a0, %a1
1639 %2 = trunc <4 x i64> %1 to <4 x i32>
1643 define <8 x i16> @trunc_mul_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
1644 ; SSE-LABEL: trunc_mul_v8i64_v8i16:
1646 ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm7[0,2]
1647 ; SSE-NEXT: pslld $16, %xmm6
1648 ; SSE-NEXT: psrad $16, %xmm6
1649 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm5[0,2]
1650 ; SSE-NEXT: pslld $16, %xmm4
1651 ; SSE-NEXT: psrad $16, %xmm4
1652 ; SSE-NEXT: packssdw %xmm6, %xmm4
1653 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
1654 ; SSE-NEXT: pslld $16, %xmm2
1655 ; SSE-NEXT: psrad $16, %xmm2
1656 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
1657 ; SSE-NEXT: pslld $16, %xmm0
1658 ; SSE-NEXT: psrad $16, %xmm0
1659 ; SSE-NEXT: packssdw %xmm2, %xmm0
1660 ; SSE-NEXT: pmullw %xmm4, %xmm0
1663 ; AVX1-LABEL: trunc_mul_v8i64_v8i16:
1665 ; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [65535,65535,65535,65535]
1666 ; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
1667 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
1668 ; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
1669 ; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
1670 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
1671 ; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2
1672 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
1673 ; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
1674 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
1675 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
1676 ; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
1677 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
1678 ; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
1679 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
1680 ; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0
1681 ; AVX1-NEXT: vzeroupper
1684 ; AVX2-LABEL: trunc_mul_v8i64_v8i16:
1686 ; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
1687 ; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3],ymm3[4],ymm4[5,6,7],ymm3[8],ymm4[9,10,11],ymm3[12],ymm4[13,14,15]
1688 ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1,2,3],ymm2[4],ymm4[5,6,7],ymm2[8],ymm4[9,10,11],ymm2[12],ymm4[13,14,15]
1689 ; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2
1690 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
1691 ; AVX2-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
1692 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm4[1,2,3],ymm1[4],ymm4[5,6,7],ymm1[8],ymm4[9,10,11],ymm1[12],ymm4[13,14,15]
1693 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm4[1,2,3],ymm0[4],ymm4[5,6,7],ymm0[8],ymm4[9,10,11],ymm0[12],ymm4[13,14,15]
1694 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
1695 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1696 ; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
1697 ; AVX2-NEXT: vpmullw %xmm2, %xmm0, %xmm0
1698 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
1699 ; AVX2-NEXT: vzeroupper
1702 ; AVX512F-LABEL: trunc_mul_v8i64_v8i16:
1704 ; AVX512F-NEXT: vpmovqw %zmm1, %xmm1
1705 ; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
1706 ; AVX512F-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1707 ; AVX512F-NEXT: vzeroupper
1708 ; AVX512F-NEXT: retq
1710 ; AVX512BW-LABEL: trunc_mul_v8i64_v8i16:
1711 ; AVX512BW: # %bb.0:
1712 ; AVX512BW-NEXT: vpmovqw %zmm1, %xmm1
1713 ; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0
1714 ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1715 ; AVX512BW-NEXT: vzeroupper
1716 ; AVX512BW-NEXT: retq
1718 ; AVX512DQ-LABEL: trunc_mul_v8i64_v8i16:
1719 ; AVX512DQ: # %bb.0:
1720 ; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
1721 ; AVX512DQ-NEXT: vpmovqw %zmm0, %xmm0
1722 ; AVX512DQ-NEXT: vzeroupper
1723 ; AVX512DQ-NEXT: retq
1724 %1 = mul <8 x i64> %a0, %a1
1725 %2 = trunc <8 x i64> %1 to <8 x i16>
1729 define <8 x i16> @trunc_mul_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
1730 ; SSE-LABEL: trunc_mul_v8i32_v8i16:
1732 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
1733 ; SSE-NEXT: pmuludq %xmm2, %xmm0
1734 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1735 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
1736 ; SSE-NEXT: pmuludq %xmm4, %xmm2
1737 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
1738 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
1739 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
1740 ; SSE-NEXT: pmuludq %xmm3, %xmm1
1741 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1742 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
1743 ; SSE-NEXT: pmuludq %xmm2, %xmm3
1744 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
1745 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
1746 ; SSE-NEXT: pslld $16, %xmm1
1747 ; SSE-NEXT: psrad $16, %xmm1
1748 ; SSE-NEXT: pslld $16, %xmm0
1749 ; SSE-NEXT: psrad $16, %xmm0
1750 ; SSE-NEXT: packssdw %xmm1, %xmm0
1753 ; AVX1-LABEL: trunc_mul_v8i32_v8i16:
1755 ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm2
1756 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
1757 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1758 ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
1759 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
1760 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
1761 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4],xmm1[5],xmm2[6],xmm1[7]
1762 ; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0
1763 ; AVX1-NEXT: vzeroupper
1766 ; AVX2-LABEL: trunc_mul_v8i32_v8i16:
1768 ; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0
1769 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
1770 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1771 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1772 ; AVX2-NEXT: vzeroupper
1775 ; AVX512-LABEL: trunc_mul_v8i32_v8i16:
1777 ; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm0
1778 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
1779 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1780 ; AVX512-NEXT: vzeroupper
1782 %1 = mul <8 x i32> %a0, %a1
1783 %2 = trunc <8 x i32> %1 to <8 x i16>
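; v16i64 -> v16i8: the low 8 bits of a product depend only on the low bits of the operands, so SSE, AVX1/AVX2 and AVX512F can use the 32x32-bit pmuludq/vpmuludq; only AVX512DQ, which has a native vpmullq, multiplies at full 64-bit width.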
1787 define <16 x i8> @trunc_mul_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
1788 ; SSE-LABEL: trunc_mul_v16i64_v16i8:
1790 ; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm0
1791 ; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm1
1792 ; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm2
1793 ; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm3
1794 ; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm4
1795 ; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm5
1796 ; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm6
1797 ; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm7
1798 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
1799 ; SSE-NEXT: pand %xmm8, %xmm7
1800 ; SSE-NEXT: pand %xmm8, %xmm6
1801 ; SSE-NEXT: packuswb %xmm7, %xmm6
1802 ; SSE-NEXT: pand %xmm8, %xmm5
1803 ; SSE-NEXT: pand %xmm8, %xmm4
1804 ; SSE-NEXT: packuswb %xmm5, %xmm4
1805 ; SSE-NEXT: packuswb %xmm6, %xmm4
1806 ; SSE-NEXT: pand %xmm8, %xmm3
1807 ; SSE-NEXT: pand %xmm8, %xmm2
1808 ; SSE-NEXT: packuswb %xmm3, %xmm2
1809 ; SSE-NEXT: pand %xmm8, %xmm1
1810 ; SSE-NEXT: pand %xmm8, %xmm0
1811 ; SSE-NEXT: packuswb %xmm1, %xmm0
1812 ; SSE-NEXT: packuswb %xmm2, %xmm0
1813 ; SSE-NEXT: packuswb %xmm4, %xmm0
1816 ; AVX1-LABEL: trunc_mul_v16i64_v16i8:
1818 ; AVX1-NEXT: vpmuludq %xmm4, %xmm0, %xmm8
1819 ; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm4
1820 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1821 ; AVX1-NEXT: vpmuludq %xmm4, %xmm0, %xmm0
1822 ; AVX1-NEXT: vpmuludq %xmm5, %xmm1, %xmm4
1823 ; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm5
1824 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
1825 ; AVX1-NEXT: vpmuludq %xmm5, %xmm1, %xmm1
1826 ; AVX1-NEXT: vpmuludq %xmm6, %xmm2, %xmm5
1827 ; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm6
1828 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
1829 ; AVX1-NEXT: vpmuludq %xmm6, %xmm2, %xmm2
1830 ; AVX1-NEXT: vpmuludq %xmm7, %xmm3, %xmm6
1831 ; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm7
1832 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
1833 ; AVX1-NEXT: vpmuludq %xmm7, %xmm3, %xmm3
1834 ; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm7 = [255,255]
1835 ; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
1836 ; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6
1837 ; AVX1-NEXT: vpackusdw %xmm3, %xmm6, %xmm3
1838 ; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2
1839 ; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5
1840 ; AVX1-NEXT: vpackusdw %xmm2, %xmm5, %xmm2
1841 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
1842 ; AVX1-NEXT: vpand %xmm7, %xmm1, %xmm1
1843 ; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm3
1844 ; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1
1845 ; AVX1-NEXT: vpand %xmm7, %xmm0, %xmm0
1846 ; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm3
1847 ; AVX1-NEXT: vpackusdw %xmm0, %xmm3, %xmm0
1848 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
1849 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
1850 ; AVX1-NEXT: vzeroupper
1853 ; AVX2-LABEL: trunc_mul_v16i64_v16i8:
1855 ; AVX2-NEXT: vpmuludq %ymm4, %ymm0, %ymm0
1856 ; AVX2-NEXT: vpmuludq %ymm5, %ymm1, %ymm1
1857 ; AVX2-NEXT: vpmuludq %ymm6, %ymm2, %ymm2
1858 ; AVX2-NEXT: vpmuludq %ymm7, %ymm3, %ymm3
1859 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
1860 ; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
1861 ; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
1862 ; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2
1863 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
1864 ; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
1865 ; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
1866 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
1867 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
1868 ; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0
1869 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1870 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
1871 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
1872 ; AVX2-NEXT: vzeroupper
1875 ; AVX512F-LABEL: trunc_mul_v16i64_v16i8:
1877 ; AVX512F-NEXT: vpmuludq %zmm2, %zmm0, %zmm0
1878 ; AVX512F-NEXT: vpmuludq %zmm3, %zmm1, %zmm1
1879 ; AVX512F-NEXT: vpmovqb %zmm1, %xmm1
1880 ; AVX512F-NEXT: vpmovqb %zmm0, %xmm0
1881 ; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1882 ; AVX512F-NEXT: vzeroupper
1883 ; AVX512F-NEXT: retq
1885 ; AVX512BW-LABEL: trunc_mul_v16i64_v16i8:
1886 ; AVX512BW: # %bb.0:
1887 ; AVX512BW-NEXT: vpmuludq %zmm2, %zmm0, %zmm0
1888 ; AVX512BW-NEXT: vpmuludq %zmm3, %zmm1, %zmm1
1889 ; AVX512BW-NEXT: vpmovqb %zmm1, %xmm1
1890 ; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0
1891 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1892 ; AVX512BW-NEXT: vzeroupper
1893 ; AVX512BW-NEXT: retq
1895 ; AVX512DQ-LABEL: trunc_mul_v16i64_v16i8:
1896 ; AVX512DQ: # %bb.0:
1897 ; AVX512DQ-NEXT: vpmullq %zmm2, %zmm0, %zmm0
1898 ; AVX512DQ-NEXT: vpmullq %zmm3, %zmm1, %zmm1
1899 ; AVX512DQ-NEXT: vpmovqb %zmm1, %xmm1
1900 ; AVX512DQ-NEXT: vpmovqb %zmm0, %xmm0
1901 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1902 ; AVX512DQ-NEXT: vzeroupper
1903 ; AVX512DQ-NEXT: retq
1904 %1 = mul <16 x i64> %a0, %a1
1905 %2 = trunc <16 x i64> %1 to <16 x i8>
1909 define <16 x i8> @trunc_mul_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
1910 ; SSE-LABEL: trunc_mul_v16i32_v16i8:
1912 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3]
1913 ; SSE-NEXT: pmuludq %xmm4, %xmm0
1914 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1915 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
1916 ; SSE-NEXT: pmuludq %xmm8, %xmm4
1917 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
1918 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
1919 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
1920 ; SSE-NEXT: pmuludq %xmm5, %xmm1
1921 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1922 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
1923 ; SSE-NEXT: pmuludq %xmm4, %xmm5
1924 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3]
1925 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
1926 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
1927 ; SSE-NEXT: pmuludq %xmm6, %xmm2
1928 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
1929 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
1930 ; SSE-NEXT: pmuludq %xmm4, %xmm5
1931 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3]
1932 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
1933 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
1934 ; SSE-NEXT: pmuludq %xmm7, %xmm3
1935 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
1936 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
1937 ; SSE-NEXT: pmuludq %xmm4, %xmm5
1938 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3]
1939 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
1940 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
1941 ; SSE-NEXT: pand %xmm4, %xmm3
1942 ; SSE-NEXT: pand %xmm4, %xmm2
1943 ; SSE-NEXT: packuswb %xmm3, %xmm2
1944 ; SSE-NEXT: pand %xmm4, %xmm1
1945 ; SSE-NEXT: pand %xmm4, %xmm0
1946 ; SSE-NEXT: packuswb %xmm1, %xmm0
1947 ; SSE-NEXT: packuswb %xmm2, %xmm0
1950 ; AVX1-LABEL: trunc_mul_v16i32_v16i8:
1952 ; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm4
1953 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
1954 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1955 ; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm0
1956 ; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm2
1957 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
1958 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
1959 ; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm1
1960 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255]
1961 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
1962 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
1963 ; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
1964 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
1965 ; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm2
1966 ; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
1967 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
1968 ; AVX1-NEXT: vzeroupper
1971 ; AVX2-LABEL: trunc_mul_v16i32_v16i8:
1973 ; AVX2-NEXT: vpmulld %ymm2, %ymm0, %ymm0
1974 ; AVX2-NEXT: vpmulld %ymm3, %ymm1, %ymm1
1975 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
1976 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
1977 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
1978 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
1979 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1980 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
1981 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
1982 ; AVX2-NEXT: vzeroupper
1985 ; AVX512-LABEL: trunc_mul_v16i32_v16i8:
1987 ; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0
1988 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
1989 ; AVX512-NEXT: vzeroupper
1991 %1 = mul <16 x i32> %a0, %a1
1992 %2 = trunc <16 x i32> %1 to <16 x i8>
1996 define <16 x i8> @trunc_mul_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
1997 ; SSE-LABEL: trunc_mul_v16i16_v16i8:
1999 ; SSE-NEXT: pmullw %xmm2, %xmm0
2000 ; SSE-NEXT: pmullw %xmm3, %xmm1
2001 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
2002 ; SSE-NEXT: pand %xmm2, %xmm1
2003 ; SSE-NEXT: pand %xmm2, %xmm0
2004 ; SSE-NEXT: packuswb %xmm1, %xmm0
2007 ; AVX1-LABEL: trunc_mul_v16i16_v16i8:
2009 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm2
2010 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
2011 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2012 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
2013 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
2014 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
2015 ; AVX1-NEXT: vpand %xmm1, %xmm2, %xmm1
2016 ; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0
2017 ; AVX1-NEXT: vzeroupper
2020 ; AVX2-LABEL: trunc_mul_v16i16_v16i8:
2022 ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
2023 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2024 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
2025 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
2026 ; AVX2-NEXT: vzeroupper
2029 ; AVX512F-LABEL: trunc_mul_v16i16_v16i8:
2031 ; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm0
2032 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
2033 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
2034 ; AVX512F-NEXT: vzeroupper
2035 ; AVX512F-NEXT: retq
2037 ; AVX512BW-LABEL: trunc_mul_v16i16_v16i8:
2038 ; AVX512BW: # %bb.0:
2039 ; AVX512BW-NEXT: vpmullw %ymm1, %ymm0, %ymm0
2040 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
2041 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2042 ; AVX512BW-NEXT: vzeroupper
2043 ; AVX512BW-NEXT: retq
2045 ; AVX512DQ-LABEL: trunc_mul_v16i16_v16i8:
2046 ; AVX512DQ: # %bb.0:
2047 ; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0
2048 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
2049 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
2050 ; AVX512DQ-NEXT: vzeroupper
2051 ; AVX512DQ-NEXT: retq
2052 %1 = mul <16 x i16> %a0, %a1
2053 %2 = trunc <16 x i16> %1 to <16 x i8>
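; The multiplicand is a zero-extended <8 x i8>, so the whole multiply fits in 16 bits: %a1 is truncated to words and the product is formed with pmullw.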
2057 define <8 x i16> @trunc_mul_v8i32_v8i16_zext_8i8(<16 x i8> %a0, <8 x i32> %a1) {
2058 ; SSE-LABEL: trunc_mul_v8i32_v8i16_zext_8i8:
2060 ; SSE-NEXT: pxor %xmm3, %xmm3
2061 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
2062 ; SSE-NEXT: pslld $16, %xmm2
2063 ; SSE-NEXT: psrad $16, %xmm2
2064 ; SSE-NEXT: pslld $16, %xmm1
2065 ; SSE-NEXT: psrad $16, %xmm1
2066 ; SSE-NEXT: packssdw %xmm2, %xmm1
2067 ; SSE-NEXT: pmullw %xmm1, %xmm0
2070 ; AVX1-LABEL: trunc_mul_v8i32_v8i16_zext_8i8:
2072 ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
2073 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
2074 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
2075 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2076 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
2077 ; AVX1-NEXT: vzeroupper
2080 ; AVX2-LABEL: trunc_mul_v8i32_v8i16_zext_8i8:
2082 ; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2083 ; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
2084 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
2085 ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
2086 ; AVX2-NEXT: vzeroupper
2089 ; AVX512-LABEL: trunc_mul_v8i32_v8i16_zext_8i8:
2091 ; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
2092 ; AVX512-NEXT: vpmovdw %zmm1, %ymm1
2093 ; AVX512-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2094 ; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0
2095 ; AVX512-NEXT: vzeroupper
2097 %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2098 %2 = zext <8 x i8> %1 to <8 x i32>
2099 %3 = mul <8 x i32> %2, %a1
2100 %4 = trunc <8 x i32> %3 to <8 x i16>
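; Multiply-by-constant variants: the constant vector lives in the constant pool and is folded into the multiply at whatever width the lowering chooses (pmuludq, vpmulld, or pmullw).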
2108 define <4 x i32> @trunc_mul_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
2109 ; SSE-LABEL: trunc_mul_const_v4i64_v4i32:
2111 ; SSE-NEXT: xorps %xmm2, %xmm2
2112 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
2113 ; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
2114 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2]
2115 ; SSE-NEXT: movaps %xmm2, %xmm0
2118 ; AVX1-LABEL: trunc_mul_const_v4i64_v4i32:
2120 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
2121 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2122 ; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2123 ; AVX1-NEXT: vzeroupper
2126 ; AVX2-SLOW-LABEL: trunc_mul_const_v4i64_v4i32:
2127 ; AVX2-SLOW: # %bb.0:
2128 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
2129 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2130 ; AVX2-SLOW-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2131 ; AVX2-SLOW-NEXT: vzeroupper
2132 ; AVX2-SLOW-NEXT: retq
2134 ; AVX2-FAST-ALL-LABEL: trunc_mul_const_v4i64_v4i32:
2135 ; AVX2-FAST-ALL: # %bb.0:
2136 ; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,6,0,0,0,0]
2137 ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0
2138 ; AVX2-FAST-ALL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2139 ; AVX2-FAST-ALL-NEXT: vzeroupper
2140 ; AVX2-FAST-ALL-NEXT: retq
2142 ; AVX2-FAST-PERLANE-LABEL: trunc_mul_const_v4i64_v4i32:
2143 ; AVX2-FAST-PERLANE: # %bb.0:
2144 ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1
2145 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2146 ; AVX2-FAST-PERLANE-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2147 ; AVX2-FAST-PERLANE-NEXT: vzeroupper
2148 ; AVX2-FAST-PERLANE-NEXT: retq
2150 ; AVX512-LABEL: trunc_mul_const_v4i64_v4i32:
2152 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
2153 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
2154 ; AVX512-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2155 ; AVX512-NEXT: vzeroupper
2157 %1 = mul <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
2158 %2 = trunc <4 x i64> %1 to <4 x i32>
2162 define <8 x i16> @trunc_mul_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
2163 ; SSE-LABEL: trunc_mul_const_v8i64_v8i16:
2165 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
2166 ; SSE-NEXT: pslld $16, %xmm2
2167 ; SSE-NEXT: psrad $16, %xmm2
2168 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2169 ; SSE-NEXT: pslld $16, %xmm0
2170 ; SSE-NEXT: psrad $16, %xmm0
2171 ; SSE-NEXT: packssdw %xmm2, %xmm0
2172 ; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,1,2,3,4,5,6,7]
2175 ; AVX1-LABEL: trunc_mul_const_v8i64_v8i16:
2177 ; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,65535,65535,65535]
2178 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
2179 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
2180 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
2181 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
2182 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
2183 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
2184 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
2185 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,1,2,3,4,5,6,7]
2186 ; AVX1-NEXT: vzeroupper
2189 ; AVX2-LABEL: trunc_mul_const_v8i64_v8i16:
2191 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
2192 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
2193 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
2194 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
2195 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
2196 ; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
2197 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
2198 ; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,1,2,3,4,5,6,7]
2199 ; AVX2-NEXT: vzeroupper
2202 ; AVX512-LABEL: trunc_mul_const_v8i64_v8i16:
2204 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0
2205 ; AVX512-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,1,2,3,4,5,6,7]
2206 ; AVX512-NEXT: vzeroupper
2208 %1 = mul <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
2209 %2 = trunc <8 x i64> %1 to <8 x i16>
2213 define <8 x i16> @trunc_mul_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
2214 ; SSE-LABEL: trunc_mul_const_v8i32_v8i16:
2216 ; SSE-NEXT: pslld $16, %xmm1
2217 ; SSE-NEXT: psrad $16, %xmm1
2218 ; SSE-NEXT: pslld $16, %xmm0
2219 ; SSE-NEXT: psrad $16, %xmm0
2220 ; SSE-NEXT: packssdw %xmm1, %xmm0
2221 ; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,1,2,3,4,5,6,7]
2224 ; AVX1-LABEL: trunc_mul_const_v8i32_v8i16:
2226 ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2227 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
2228 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
2229 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,1,2,3,4,5,6,7]
2230 ; AVX1-NEXT: vzeroupper
2233 ; AVX2-LABEL: trunc_mul_const_v8i32_v8i16:
2235 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
2236 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
2237 ; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,1,2,3,4,5,6,7]
2238 ; AVX2-NEXT: vzeroupper
2241 ; AVX512-LABEL: trunc_mul_const_v8i32_v8i16:
2243 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
2244 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
2245 ; AVX512-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,1,2,3,4,5,6,7]
2246 ; AVX512-NEXT: vzeroupper
2248 %1 = mul <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2249 %2 = trunc <8 x i32> %1 to <8 x i16>
2253 define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
2254 ; SSE-LABEL: trunc_mul_const_v16i64_v16i8:
2256 ; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
2257 ; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
2258 ; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
2259 ; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
2260 ; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5
2261 ; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
2262 ; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7
2263 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
2264 ; SSE-NEXT: pand %xmm8, %xmm7
2265 ; SSE-NEXT: pand %xmm8, %xmm6
2266 ; SSE-NEXT: packuswb %xmm7, %xmm6
2267 ; SSE-NEXT: pand %xmm8, %xmm5
2268 ; SSE-NEXT: pand %xmm8, %xmm4
2269 ; SSE-NEXT: packuswb %xmm5, %xmm4
2270 ; SSE-NEXT: packuswb %xmm6, %xmm4
2271 ; SSE-NEXT: pand %xmm8, %xmm3
2272 ; SSE-NEXT: pand %xmm8, %xmm2
2273 ; SSE-NEXT: packuswb %xmm3, %xmm2
2274 ; SSE-NEXT: pand %xmm8, %xmm1
2275 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2276 ; SSE-NEXT: packuswb %xmm1, %xmm0
2277 ; SSE-NEXT: packuswb %xmm2, %xmm0
2278 ; SSE-NEXT: packuswb %xmm4, %xmm0
2281 ; AVX1-LABEL: trunc_mul_const_v16i64_v16i8:
2283 ; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm4
2284 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2285 ; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2286 ; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm5
2287 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
2288 ; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
2289 ; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm6
2290 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
2291 ; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
2292 ; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm7
2293 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
2294 ; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
2295 ; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm8 = [255,255]
2296 ; AVX1-NEXT: vpand %xmm3, %xmm8, %xmm3
2297 ; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm7
2298 ; AVX1-NEXT: vpackusdw %xmm3, %xmm7, %xmm3
2299 ; AVX1-NEXT: vpand %xmm2, %xmm8, %xmm2
2300 ; AVX1-NEXT: vpand %xmm6, %xmm8, %xmm6
2301 ; AVX1-NEXT: vpackusdw %xmm2, %xmm6, %xmm2
2302 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
2303 ; AVX1-NEXT: vpand %xmm1, %xmm8, %xmm1
2304 ; AVX1-NEXT: vpand %xmm5, %xmm8, %xmm3
2305 ; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1
2306 ; AVX1-NEXT: vpand %xmm0, %xmm8, %xmm0
2307 ; AVX1-NEXT: vpand %xmm4, %xmm8, %xmm3
2308 ; AVX1-NEXT: vpackusdw %xmm0, %xmm3, %xmm0
2309 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
2310 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
2311 ; AVX1-NEXT: vzeroupper
2314 ; AVX2-LABEL: trunc_mul_const_v16i64_v16i8:
2316 ; AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2317 ; AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
2318 ; AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
2319 ; AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
2320 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
2321 ; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
2322 ; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
2323 ; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2
2324 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
2325 ; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
2326 ; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
2327 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
2328 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
2329 ; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0
2330 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
2331 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
2332 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
2333 ; AVX2-NEXT: vzeroupper
2336 ; AVX512F-LABEL: trunc_mul_const_v16i64_v16i8:
2338 ; AVX512F-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
2339 ; AVX512F-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
2340 ; AVX512F-NEXT: vpmovqb %zmm1, %xmm1
2341 ; AVX512F-NEXT: vpmovqb %zmm0, %xmm0
2342 ; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2343 ; AVX512F-NEXT: vzeroupper
2344 ; AVX512F-NEXT: retq
2346 ; AVX512BW-LABEL: trunc_mul_const_v16i64_v16i8:
2347 ; AVX512BW: # %bb.0:
2348 ; AVX512BW-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
2349 ; AVX512BW-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
2350 ; AVX512BW-NEXT: vpmovqb %zmm1, %xmm1
2351 ; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0
2352 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2353 ; AVX512BW-NEXT: vzeroupper
2354 ; AVX512BW-NEXT: retq
2356 ; AVX512DQ-LABEL: trunc_mul_const_v16i64_v16i8:
2357 ; AVX512DQ: # %bb.0:
2358 ; AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
2359 ; AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
2360 ; AVX512DQ-NEXT: vpmovqb %zmm1, %xmm1
2361 ; AVX512DQ-NEXT: vpmovqb %zmm0, %xmm0
2362 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2363 ; AVX512DQ-NEXT: vzeroupper
2364 ; AVX512DQ-NEXT: retq
2365 %1 = mul <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
2366 %2 = trunc <16 x i64> %1 to <16 x i8>
2370 define <16 x i8> @trunc_mul_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
2371 ; SSE-LABEL: trunc_mul_const_v16i32_v16i8:
2373 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
2374 ; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2375 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2376 ; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
2377 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
2378 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
2379 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
2380 ; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
2381 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
2382 ; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
2383 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
2384 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
2385 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
2386 ; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
2387 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
2388 ; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
2389 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
2390 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
2391 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
2392 ; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
2393 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
2394 ; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
2395 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
2396 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
2397 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
2398 ; SSE-NEXT: pand %xmm4, %xmm3
2399 ; SSE-NEXT: pand %xmm4, %xmm2
2400 ; SSE-NEXT: packuswb %xmm3, %xmm2
2401 ; SSE-NEXT: pand %xmm4, %xmm1
2402 ; SSE-NEXT: pand %xmm4, %xmm0
2403 ; SSE-NEXT: packuswb %xmm1, %xmm0
2404 ; SSE-NEXT: packuswb %xmm2, %xmm0
2407 ; AVX1-LABEL: trunc_mul_const_v16i32_v16i8:
2409 ; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
2410 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2411 ; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2412 ; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm3
2413 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
2414 ; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
2415 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [255,255,255,255]
2416 ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
2417 ; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
2418 ; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1
2419 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
2420 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
2421 ; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
2422 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
2423 ; AVX1-NEXT: vzeroupper
2426 ; AVX2-LABEL: trunc_mul_const_v16i32_v16i8:
2428 ; AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2429 ; AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
2430 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
2431 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
2432 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
2433 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
2434 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
2435 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
2436 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
2437 ; AVX2-NEXT: vzeroupper
2440 ; AVX512-LABEL: trunc_mul_const_v16i32_v16i8:
2442 ; AVX512-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
2443 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
2444 ; AVX512-NEXT: vzeroupper
2446 %1 = mul <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
2447 %2 = trunc <16 x i32> %1 to <16 x i8>
2451 define <16 x i8> @trunc_mul_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
2452 ; SSE-LABEL: trunc_mul_const_v16i16_v16i8:
2454 ; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,1,2,3,4,5,6,7]
2455 ; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [8,9,10,11,12,13,14,15]
2456 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
2457 ; SSE-NEXT: pand %xmm2, %xmm1
2458 ; SSE-NEXT: pand %xmm2, %xmm0
2459 ; SSE-NEXT: packuswb %xmm1, %xmm0
2462 ; AVX1-LABEL: trunc_mul_const_v16i16_v16i8:
2464 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [0,1,2,3,4,5,6,7]
2465 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2466 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [8,9,10,11,12,13,14,15]
2467 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
2468 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
2469 ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
2470 ; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0
2471 ; AVX1-NEXT: vzeroupper
2474 ; AVX2-LABEL: trunc_mul_const_v16i16_v16i8:
2476 ; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
2477 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2478 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
2479 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
2480 ; AVX2-NEXT: vzeroupper
2483 ; AVX512F-LABEL: trunc_mul_const_v16i16_v16i8:
2485 ; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
2486 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
2487 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
2488 ; AVX512F-NEXT: vzeroupper
2489 ; AVX512F-NEXT: retq
2491 ; AVX512BW-LABEL: trunc_mul_const_v16i16_v16i8:
2492 ; AVX512BW: # %bb.0:
2493 ; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
2494 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
2495 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2496 ; AVX512BW-NEXT: vzeroupper
2497 ; AVX512BW-NEXT: retq
2499 ; AVX512DQ-LABEL: trunc_mul_const_v16i16_v16i8:
2500 ; AVX512DQ: # %bb.0:
2501 ; AVX512DQ-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
2502 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
2503 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
2504 ; AVX512DQ-NEXT: vzeroupper
2505 ; AVX512DQ-NEXT: retq
2506 %1 = mul <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
2507 %2 = trunc <16 x i16> %1 to <16 x i8>
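; Bitwise AND variants. AND commutes with truncation, so the logic op is done at full width (andps/pand on SSE, vpand/vpandq on AVX/AVX512) and only the pack/shuffle work of the truncation differs per target.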
2515 define <4 x i32> @trunc_and_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2516 ; SSE-LABEL: trunc_and_v4i64_v4i32:
2518 ; SSE-NEXT: andps %xmm3, %xmm1
2519 ; SSE-NEXT: andps %xmm2, %xmm0
2520 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2523 ; AVX1-LABEL: trunc_and_v4i64_v4i32:
2525 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
2526 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
2527 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2528 ; AVX1-NEXT: vzeroupper
2531 ; AVX2-SLOW-LABEL: trunc_and_v4i64_v4i32:
2532 ; AVX2-SLOW: # %bb.0:
2533 ; AVX2-SLOW-NEXT: vandps %ymm1, %ymm0, %ymm0
2534 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
2535 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2536 ; AVX2-SLOW-NEXT: vzeroupper
2537 ; AVX2-SLOW-NEXT: retq
2539 ; AVX2-FAST-ALL-LABEL: trunc_and_v4i64_v4i32:
2540 ; AVX2-FAST-ALL: # %bb.0:
2541 ; AVX2-FAST-ALL-NEXT: vandps %ymm1, %ymm0, %ymm0
2542 ; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6]
2543 ; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1]
2544 ; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0
2545 ; AVX2-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2546 ; AVX2-FAST-ALL-NEXT: vzeroupper
2547 ; AVX2-FAST-ALL-NEXT: retq
2549 ; AVX2-FAST-PERLANE-LABEL: trunc_and_v4i64_v4i32:
2550 ; AVX2-FAST-PERLANE: # %bb.0:
2551 ; AVX2-FAST-PERLANE-NEXT: vandps %ymm1, %ymm0, %ymm0
2552 ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1
2553 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2554 ; AVX2-FAST-PERLANE-NEXT: vzeroupper
2555 ; AVX2-FAST-PERLANE-NEXT: retq
2557 ; AVX512-LABEL: trunc_and_v4i64_v4i32:
2559 ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
2560 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
2561 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2562 ; AVX512-NEXT: vzeroupper
2564 %1 = and <4 x i64> %a0, %a1
2565 %2 = trunc <4 x i64> %1 to <4 x i32>
2569 define <8 x i16> @trunc_and_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
2570 ; SSE-LABEL: trunc_and_v8i64_v8i16:
2572 ; SSE-NEXT: andps %xmm5, %xmm1
2573 ; SSE-NEXT: andps %xmm4, %xmm0
2574 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2575 ; SSE-NEXT: andps %xmm7, %xmm3
2576 ; SSE-NEXT: andps %xmm6, %xmm2
2577 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
2578 ; SSE-NEXT: pslld $16, %xmm2
2579 ; SSE-NEXT: psrad $16, %xmm2
2580 ; SSE-NEXT: pslld $16, %xmm0
2581 ; SSE-NEXT: psrad $16, %xmm0
2582 ; SSE-NEXT: packssdw %xmm2, %xmm0
2585 ; AVX1-LABEL: trunc_and_v8i64_v8i16:
2587 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
2588 ; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
2589 ; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,65535,65535,65535]
2590 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
2591 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
2592 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
2593 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
2594 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
2595 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
2596 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
2597 ; AVX1-NEXT: vzeroupper
2600 ; AVX2-LABEL: trunc_and_v8i64_v8i16:
2602 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
2603 ; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
2604 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
2605 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
2606 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
2607 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
2608 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
2609 ; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
2610 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
2611 ; AVX2-NEXT: vzeroupper
2614 ; AVX512-LABEL: trunc_and_v8i64_v8i16:
2616 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
2617 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0
2618 ; AVX512-NEXT: vzeroupper
2620 %1 = and <8 x i64> %a0, %a1
2621 %2 = trunc <8 x i64> %1 to <8 x i16>
2625 define <8 x i16> @trunc_and_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
2626 ; SSE-LABEL: trunc_and_v8i32_v8i16:
2628 ; SSE-NEXT: pand %xmm2, %xmm0
2629 ; SSE-NEXT: pand %xmm3, %xmm1
2630 ; SSE-NEXT: pslld $16, %xmm1
2631 ; SSE-NEXT: psrad $16, %xmm1
2632 ; SSE-NEXT: pslld $16, %xmm0
2633 ; SSE-NEXT: psrad $16, %xmm0
2634 ; SSE-NEXT: packssdw %xmm1, %xmm0
2637 ; AVX1-LABEL: trunc_and_v8i32_v8i16:
2639 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
2640 ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2641 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
2642 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
2643 ; AVX1-NEXT: vzeroupper
2646 ; AVX2-LABEL: trunc_and_v8i32_v8i16:
2648 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
2649 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
2650 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
2651 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2652 ; AVX2-NEXT: vzeroupper
2655 ; AVX512-LABEL: trunc_and_v8i32_v8i16:
2657 ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
2658 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
2659 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2660 ; AVX512-NEXT: vzeroupper
2662 %1 = and <8 x i32> %a0, %a1
2663 %2 = trunc <8 x i32> %1 to <8 x i16>
2667 define <16 x i8> @trunc_and_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
2668 ; SSE-LABEL: trunc_and_v16i64_v16i8:
2670 ; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm0
2671 ; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm1
2672 ; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm2
2673 ; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm3
2674 ; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm4
2675 ; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm5
2676 ; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm6
2677 ; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm7
2678 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
2679 ; SSE-NEXT: pand %xmm8, %xmm7
2680 ; SSE-NEXT: pand %xmm8, %xmm6
2681 ; SSE-NEXT: packuswb %xmm7, %xmm6
2682 ; SSE-NEXT: pand %xmm8, %xmm5
2683 ; SSE-NEXT: pand %xmm8, %xmm4
2684 ; SSE-NEXT: packuswb %xmm5, %xmm4
2685 ; SSE-NEXT: packuswb %xmm6, %xmm4
2686 ; SSE-NEXT: pand %xmm8, %xmm3
2687 ; SSE-NEXT: pand %xmm8, %xmm2
2688 ; SSE-NEXT: packuswb %xmm3, %xmm2
2689 ; SSE-NEXT: pand %xmm8, %xmm1
2690 ; SSE-NEXT: pand %xmm8, %xmm0
2691 ; SSE-NEXT: packuswb %xmm1, %xmm0
2692 ; SSE-NEXT: packuswb %xmm2, %xmm0
2693 ; SSE-NEXT: packuswb %xmm4, %xmm0
2696 ; AVX1-LABEL: trunc_and_v16i64_v16i8:
2698 ; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
2699 ; AVX1-NEXT: vandps %ymm5, %ymm1, %ymm1
2700 ; AVX1-NEXT: vandps %ymm6, %ymm2, %ymm2
2701 ; AVX1-NEXT: vandps %ymm7, %ymm3, %ymm3
2702 ; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [255,255,255,255]
2703 ; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
2704 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
2705 ; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
2706 ; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
2707 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
2708 ; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2
2709 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
2710 ; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
2711 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
2712 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
2713 ; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
2714 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
2715 ; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
2716 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
2717 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
2718 ; AVX1-NEXT: vzeroupper
2721 ; AVX2-LABEL: trunc_and_v16i64_v16i8:
2723 ; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
2724 ; AVX2-NEXT: vpand %ymm5, %ymm1, %ymm1
2725 ; AVX2-NEXT: vpand %ymm6, %ymm2, %ymm2
2726 ; AVX2-NEXT: vpand %ymm7, %ymm3, %ymm3
2727 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
2728 ; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
2729 ; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
2730 ; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2
2731 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
2732 ; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
2733 ; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
2734 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
2735 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
2736 ; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0
2737 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
2738 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
2739 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
2740 ; AVX2-NEXT: vzeroupper
2743 ; AVX512-LABEL: trunc_and_v16i64_v16i8:
2745 ; AVX512-NEXT: vpandq %zmm2, %zmm0, %zmm0
2746 ; AVX512-NEXT: vpandq %zmm3, %zmm1, %zmm1
2747 ; AVX512-NEXT: vpmovqb %zmm1, %xmm1
2748 ; AVX512-NEXT: vpmovqb %zmm0, %xmm0
2749 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2750 ; AVX512-NEXT: vzeroupper
2752 %1 = and <16 x i64> %a0, %a1
2753 %2 = trunc <16 x i64> %1 to <16 x i8>
2757 define <16 x i8> @trunc_and_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
2758 ; SSE-LABEL: trunc_and_v16i32_v16i8:
2760 ; SSE-NEXT: pand %xmm4, %xmm0
2761 ; SSE-NEXT: pand %xmm5, %xmm1
2762 ; SSE-NEXT: pand %xmm6, %xmm2
2763 ; SSE-NEXT: pand %xmm7, %xmm3
2764 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
2765 ; SSE-NEXT: pand %xmm4, %xmm3
2766 ; SSE-NEXT: pand %xmm4, %xmm2
2767 ; SSE-NEXT: packuswb %xmm3, %xmm2
2768 ; SSE-NEXT: pand %xmm4, %xmm1
2769 ; SSE-NEXT: pand %xmm4, %xmm0
2770 ; SSE-NEXT: packuswb %xmm1, %xmm0
2771 ; SSE-NEXT: packuswb %xmm2, %xmm0
2774 ; AVX1-LABEL: trunc_and_v16i32_v16i8:
2776 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
2777 ; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
2778 ; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
2779 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
2780 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
2781 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
2782 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
2783 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
2784 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
2785 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
2786 ; AVX1-NEXT: vzeroupper
2789 ; AVX2-LABEL: trunc_and_v16i32_v16i8:
2791 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
2792 ; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
2793 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
2794 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
2795 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
2796 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
2797 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
2798 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
2799 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
2800 ; AVX2-NEXT: vzeroupper
2803 ; AVX512-LABEL: trunc_and_v16i32_v16i8:
2805 ; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0
2806 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
2807 ; AVX512-NEXT: vzeroupper
2809 %1 = and <16 x i32> %a0, %a1
2810 %2 = trunc <16 x i32> %1 to <16 x i8>
2814 define <16 x i8> @trunc_and_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
2815 ; SSE-LABEL: trunc_and_v16i16_v16i8:
2817 ; SSE-NEXT: pand %xmm2, %xmm0
2818 ; SSE-NEXT: pand %xmm3, %xmm1
2819 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
2820 ; SSE-NEXT: pand %xmm2, %xmm1
2821 ; SSE-NEXT: pand %xmm2, %xmm0
2822 ; SSE-NEXT: packuswb %xmm1, %xmm0
2825 ; AVX1-LABEL: trunc_and_v16i16_v16i8:
2827 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
2828 ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2829 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
2830 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
2831 ; AVX1-NEXT: vzeroupper
2834 ; AVX2-LABEL: trunc_and_v16i16_v16i8:
2836 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
2837 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2838 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
2839 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
2840 ; AVX2-NEXT: vzeroupper
2843 ; AVX512F-LABEL: trunc_and_v16i16_v16i8:
2845 ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
2846 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
2847 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
2848 ; AVX512F-NEXT: vzeroupper
2849 ; AVX512F-NEXT: retq
2851 ; AVX512BW-LABEL: trunc_and_v16i16_v16i8:
2852 ; AVX512BW: # %bb.0:
2853 ; AVX512BW-NEXT: vpand %ymm1, %ymm0, %ymm0
2854 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
2855 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2856 ; AVX512BW-NEXT: vzeroupper
2857 ; AVX512BW-NEXT: retq
2859 ; AVX512DQ-LABEL: trunc_and_v16i16_v16i8:
2860 ; AVX512DQ: # %bb.0:
2861 ; AVX512DQ-NEXT: vpand %ymm1, %ymm0, %ymm0
2862 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
2863 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
2864 ; AVX512DQ-NEXT: vzeroupper
2865 ; AVX512DQ-NEXT: retq
2866 %1 = and <16 x i16> %a0, %a1
2867 %2 = trunc <16 x i16> %1 to <16 x i8>
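; AND with a constant vector: the constants fit in the narrow element type, so in the checked lowerings the mask is applied after the truncation.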
2875 define <4 x i32> @trunc_and_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
2876 ; SSE-LABEL: trunc_and_const_v4i64_v4i32:
2878 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2879 ; SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2882 ; AVX1-LABEL: trunc_and_const_v4i64_v4i32:
2884 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
2885 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2886 ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2887 ; AVX1-NEXT: vzeroupper
2890 ; AVX2-SLOW-LABEL: trunc_and_const_v4i64_v4i32:
2891 ; AVX2-SLOW: # %bb.0:
2892 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
2893 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2894 ; AVX2-SLOW-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2895 ; AVX2-SLOW-NEXT: vzeroupper
2896 ; AVX2-SLOW-NEXT: retq
2898 ; AVX2-FAST-ALL-LABEL: trunc_and_const_v4i64_v4i32:
2899 ; AVX2-FAST-ALL: # %bb.0:
2900 ; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6]
2901 ; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1]
2902 ; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0
2903 ; AVX2-FAST-ALL-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2904 ; AVX2-FAST-ALL-NEXT: vzeroupper
2905 ; AVX2-FAST-ALL-NEXT: retq
2907 ; AVX2-FAST-PERLANE-LABEL: trunc_and_const_v4i64_v4i32:
2908 ; AVX2-FAST-PERLANE: # %bb.0:
2909 ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1
2910 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2911 ; AVX2-FAST-PERLANE-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2912 ; AVX2-FAST-PERLANE-NEXT: vzeroupper
2913 ; AVX2-FAST-PERLANE-NEXT: retq
2915 ; AVX512-LABEL: trunc_and_const_v4i64_v4i32:
2917 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
2918 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
2919 ; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2920 ; AVX512-NEXT: vzeroupper
2922 %1 = and <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
2923 %2 = trunc <4 x i64> %1 to <4 x i32>
2924 ret <4 x i32> %2
2925 }
2927 define <8 x i16> @trunc_and_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
2928 ; SSE-LABEL: trunc_and_const_v8i64_v8i16:
2930 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
2931 ; SSE-NEXT: pslld $16, %xmm2
2932 ; SSE-NEXT: psrad $16, %xmm2
2933 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2934 ; SSE-NEXT: pslld $16, %xmm0
2935 ; SSE-NEXT: psrad $16, %xmm0
2936 ; SSE-NEXT: packssdw %xmm2, %xmm0
2937 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2940 ; AVX1-LABEL: trunc_and_const_v8i64_v8i16:
2942 ; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,65535,65535,65535]
2943 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
2944 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
2945 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
2946 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
2947 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
2948 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
2949 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
2950 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2951 ; AVX1-NEXT: vzeroupper
2954 ; AVX2-LABEL: trunc_and_const_v8i64_v8i16:
2956 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
2957 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
2958 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
2959 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
2960 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
2961 ; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
2962 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
2963 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2964 ; AVX2-NEXT: vzeroupper
2967 ; AVX512-LABEL: trunc_and_const_v8i64_v8i16:
2969 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0
2970 ; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2971 ; AVX512-NEXT: vzeroupper
2973 %1 = and <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
2974 %2 = trunc <8 x i64> %1 to <8 x i16>
2975 ret <8 x i16> %2
2976 }
2978 define <8 x i16> @trunc_and_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
2979 ; SSE-LABEL: trunc_and_const_v8i32_v8i16:
2981 ; SSE-NEXT: pslld $16, %xmm1
2982 ; SSE-NEXT: psrad $16, %xmm1
2983 ; SSE-NEXT: pslld $16, %xmm0
2984 ; SSE-NEXT: psrad $16, %xmm0
2985 ; SSE-NEXT: packssdw %xmm1, %xmm0
2986 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2989 ; AVX1-LABEL: trunc_and_const_v8i32_v8i16:
2991 ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2992 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
2993 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
2994 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2995 ; AVX1-NEXT: vzeroupper
2998 ; AVX2-LABEL: trunc_and_const_v8i32_v8i16:
3000 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
3001 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3002 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3003 ; AVX2-NEXT: vzeroupper
3006 ; AVX512-LABEL: trunc_and_const_v8i32_v8i16:
3008 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
3009 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
3010 ; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3011 ; AVX512-NEXT: vzeroupper
3013 %1 = and <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
3014 %2 = trunc <8 x i32> %1 to <8 x i16>
3015 ret <8 x i16> %2
3016 }
3018 define <16 x i8> @trunc_and_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
3019 ; SSE-LABEL: trunc_and_const_v16i64_v16i8:
3021 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
3022 ; SSE-NEXT: pand %xmm8, %xmm7
3023 ; SSE-NEXT: pand %xmm8, %xmm6
3024 ; SSE-NEXT: packuswb %xmm7, %xmm6
3025 ; SSE-NEXT: pand %xmm8, %xmm5
3026 ; SSE-NEXT: pand %xmm8, %xmm4
3027 ; SSE-NEXT: packuswb %xmm5, %xmm4
3028 ; SSE-NEXT: packuswb %xmm6, %xmm4
3029 ; SSE-NEXT: pand %xmm8, %xmm3
3030 ; SSE-NEXT: pand %xmm8, %xmm2
3031 ; SSE-NEXT: packuswb %xmm3, %xmm2
3032 ; SSE-NEXT: pand %xmm8, %xmm1
3033 ; SSE-NEXT: pand %xmm8, %xmm0
3034 ; SSE-NEXT: packuswb %xmm1, %xmm0
3035 ; SSE-NEXT: packuswb %xmm2, %xmm0
3036 ; SSE-NEXT: packuswb %xmm4, %xmm0
3037 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
3040 ; AVX1-LABEL: trunc_and_const_v16i64_v16i8:
3042 ; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [255,255,255,255]
3043 ; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
3044 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
3045 ; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
3046 ; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
3047 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
3048 ; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2
3049 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
3050 ; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
3051 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
3052 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
3053 ; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
3054 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
3055 ; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
3056 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
3057 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
3058 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3059 ; AVX1-NEXT: vzeroupper
3062 ; AVX2-LABEL: trunc_and_const_v16i64_v16i8:
3064 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
3065 ; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
3066 ; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
3067 ; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2
3068 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
3069 ; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
3070 ; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
3071 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
3072 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
3073 ; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0
3074 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
3075 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
3076 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
3077 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3078 ; AVX2-NEXT: vzeroupper
3081 ; AVX512-LABEL: trunc_and_const_v16i64_v16i8:
3083 ; AVX512-NEXT: vpmovqb %zmm1, %xmm1
3084 ; AVX512-NEXT: vpmovqb %zmm0, %xmm0
3085 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3086 ; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3087 ; AVX512-NEXT: vzeroupper
3089 %1 = and <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
3090 %2 = trunc <16 x i64> %1 to <16 x i8>
3091 ret <16 x i8> %2
3092 }
3094 define <16 x i8> @trunc_and_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
3095 ; SSE-LABEL: trunc_and_const_v16i32_v16i8:
3097 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
3098 ; SSE-NEXT: pand %xmm4, %xmm3
3099 ; SSE-NEXT: pand %xmm4, %xmm2
3100 ; SSE-NEXT: packuswb %xmm3, %xmm2
3101 ; SSE-NEXT: pand %xmm4, %xmm1
3102 ; SSE-NEXT: pand %xmm4, %xmm0
3103 ; SSE-NEXT: packuswb %xmm1, %xmm0
3104 ; SSE-NEXT: packuswb %xmm2, %xmm0
3105 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
3108 ; AVX1-LABEL: trunc_and_const_v16i32_v16i8:
3110 ; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
3111 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
3112 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
3113 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
3114 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
3115 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
3116 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
3117 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
3118 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3119 ; AVX1-NEXT: vzeroupper
3122 ; AVX2-LABEL: trunc_and_const_v16i32_v16i8:
3124 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
3125 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
3126 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
3127 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
3128 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
3129 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
3130 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
3131 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3132 ; AVX2-NEXT: vzeroupper
3135 ; AVX512-LABEL: trunc_and_const_v16i32_v16i8:
3137 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
3138 ; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3139 ; AVX512-NEXT: vzeroupper
3141 %1 = and <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
3142 %2 = trunc <16 x i32> %1 to <16 x i8>
3143 ret <16 x i8> %2
3144 }
3146 define <16 x i8> @trunc_and_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
3147 ; SSE-LABEL: trunc_and_const_v16i16_v16i8:
3149 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
3150 ; SSE-NEXT: pand %xmm2, %xmm1
3151 ; SSE-NEXT: pand %xmm2, %xmm0
3152 ; SSE-NEXT: packuswb %xmm1, %xmm0
3153 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
3156 ; AVX1-LABEL: trunc_and_const_v16i16_v16i8:
3158 ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
3159 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
3160 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
3161 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3162 ; AVX1-NEXT: vzeroupper
3165 ; AVX2-LABEL: trunc_and_const_v16i16_v16i8:
3167 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
3168 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
3169 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
3170 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3171 ; AVX2-NEXT: vzeroupper
3174 ; AVX512F-LABEL: trunc_and_const_v16i16_v16i8:
3176 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
3177 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
3178 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3179 ; AVX512F-NEXT: vzeroupper
3180 ; AVX512F-NEXT: retq
3182 ; AVX512BW-LABEL: trunc_and_const_v16i16_v16i8:
3183 ; AVX512BW: # %bb.0:
3184 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
3185 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
3186 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3187 ; AVX512BW-NEXT: vzeroupper
3188 ; AVX512BW-NEXT: retq
3190 ; AVX512DQ-LABEL: trunc_and_const_v16i16_v16i8:
3191 ; AVX512DQ: # %bb.0:
3192 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
3193 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
3194 ; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3195 ; AVX512DQ-NEXT: vzeroupper
3196 ; AVX512DQ-NEXT: retq
3197 %1 = and <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
3198 %2 = trunc <16 x i16> %1 to <16 x i8>
3199 ret <16 x i8> %2
3200 }
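3202 ;
3203 ; xor
3204 ;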
3206 define <4 x i32> @trunc_xor_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
3207 ; SSE-LABEL: trunc_xor_v4i64_v4i32:
3209 ; SSE-NEXT: xorps %xmm3, %xmm1
3210 ; SSE-NEXT: xorps %xmm2, %xmm0
3211 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3214 ; AVX1-LABEL: trunc_xor_v4i64_v4i32:
3216 ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
3217 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
3218 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3219 ; AVX1-NEXT: vzeroupper
3222 ; AVX2-SLOW-LABEL: trunc_xor_v4i64_v4i32:
3223 ; AVX2-SLOW: # %bb.0:
3224 ; AVX2-SLOW-NEXT: vxorps %ymm1, %ymm0, %ymm0
3225 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
3226 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3227 ; AVX2-SLOW-NEXT: vzeroupper
3228 ; AVX2-SLOW-NEXT: retq
3230 ; AVX2-FAST-ALL-LABEL: trunc_xor_v4i64_v4i32:
3231 ; AVX2-FAST-ALL: # %bb.0:
3232 ; AVX2-FAST-ALL-NEXT: vxorps %ymm1, %ymm0, %ymm0
3233 ; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6]
3234 ; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1]
3235 ; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0
3236 ; AVX2-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
3237 ; AVX2-FAST-ALL-NEXT: vzeroupper
3238 ; AVX2-FAST-ALL-NEXT: retq
3240 ; AVX2-FAST-PERLANE-LABEL: trunc_xor_v4i64_v4i32:
3241 ; AVX2-FAST-PERLANE: # %bb.0:
3242 ; AVX2-FAST-PERLANE-NEXT: vxorps %ymm1, %ymm0, %ymm0
3243 ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1
3244 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3245 ; AVX2-FAST-PERLANE-NEXT: vzeroupper
3246 ; AVX2-FAST-PERLANE-NEXT: retq
3248 ; AVX512-LABEL: trunc_xor_v4i64_v4i32:
3250 ; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0
3251 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
3252 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
3253 ; AVX512-NEXT: vzeroupper
3255 %1 = xor <4 x i64> %a0, %a1
3256 %2 = trunc <4 x i64> %1 to <4 x i32>
3257 ret <4 x i32> %2
3258 }
3260 define <8 x i16> @trunc_xor_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
3261 ; SSE-LABEL: trunc_xor_v8i64_v8i16:
3263 ; SSE-NEXT: xorps %xmm5, %xmm1
3264 ; SSE-NEXT: xorps %xmm4, %xmm0
3265 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3266 ; SSE-NEXT: xorps %xmm7, %xmm3
3267 ; SSE-NEXT: xorps %xmm6, %xmm2
3268 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
3269 ; SSE-NEXT: pslld $16, %xmm2
3270 ; SSE-NEXT: psrad $16, %xmm2
3271 ; SSE-NEXT: pslld $16, %xmm0
3272 ; SSE-NEXT: psrad $16, %xmm0
3273 ; SSE-NEXT: packssdw %xmm2, %xmm0
3276 ; AVX1-LABEL: trunc_xor_v8i64_v8i16:
3278 ; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0
3279 ; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm1
3280 ; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,65535,65535,65535]
3281 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
3282 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
3283 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
3284 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
3285 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
3286 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
3287 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
3288 ; AVX1-NEXT: vzeroupper
3291 ; AVX2-LABEL: trunc_xor_v8i64_v8i16:
3293 ; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
3294 ; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1
3295 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
3296 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
3297 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
3298 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
3299 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
3300 ; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
3301 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
3302 ; AVX2-NEXT: vzeroupper
3305 ; AVX512-LABEL: trunc_xor_v8i64_v8i16:
3307 ; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
3308 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0
3309 ; AVX512-NEXT: vzeroupper
3311 %1 = xor <8 x i64> %a0, %a1
3312 %2 = trunc <8 x i64> %1 to <8 x i16>
3313 ret <8 x i16> %2
3314 }
3316 define <8 x i16> @trunc_xor_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
3317 ; SSE-LABEL: trunc_xor_v8i32_v8i16:
3319 ; SSE-NEXT: pxor %xmm2, %xmm0
3320 ; SSE-NEXT: pxor %xmm3, %xmm1
3321 ; SSE-NEXT: pslld $16, %xmm1
3322 ; SSE-NEXT: psrad $16, %xmm1
3323 ; SSE-NEXT: pslld $16, %xmm0
3324 ; SSE-NEXT: psrad $16, %xmm0
3325 ; SSE-NEXT: packssdw %xmm1, %xmm0
3328 ; AVX1-LABEL: trunc_xor_v8i32_v8i16:
3330 ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
3331 ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
3332 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
3333 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
3334 ; AVX1-NEXT: vzeroupper
3337 ; AVX2-LABEL: trunc_xor_v8i32_v8i16:
3339 ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
3340 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
3341 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3342 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
3343 ; AVX2-NEXT: vzeroupper
3346 ; AVX512-LABEL: trunc_xor_v8i32_v8i16:
3348 ; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0
3349 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
3350 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
3351 ; AVX512-NEXT: vzeroupper
3353 %1 = xor <8 x i32> %a0, %a1
3354 %2 = trunc <8 x i32> %1 to <8 x i16>
3355 ret <8 x i16> %2
3356 }
3358 define <16 x i8> @trunc_xor_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
3359 ; SSE-LABEL: trunc_xor_v16i64_v16i8:
3361 ; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm0
3362 ; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm1
3363 ; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm2
3364 ; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm3
3365 ; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm4
3366 ; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm5
3367 ; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm6
3368 ; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm7
3369 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
3370 ; SSE-NEXT: pand %xmm8, %xmm7
3371 ; SSE-NEXT: pand %xmm8, %xmm6
3372 ; SSE-NEXT: packuswb %xmm7, %xmm6
3373 ; SSE-NEXT: pand %xmm8, %xmm5
3374 ; SSE-NEXT: pand %xmm8, %xmm4
3375 ; SSE-NEXT: packuswb %xmm5, %xmm4
3376 ; SSE-NEXT: packuswb %xmm6, %xmm4
3377 ; SSE-NEXT: pand %xmm8, %xmm3
3378 ; SSE-NEXT: pand %xmm8, %xmm2
3379 ; SSE-NEXT: packuswb %xmm3, %xmm2
3380 ; SSE-NEXT: pand %xmm8, %xmm1
3381 ; SSE-NEXT: pand %xmm8, %xmm0
3382 ; SSE-NEXT: packuswb %xmm1, %xmm0
3383 ; SSE-NEXT: packuswb %xmm2, %xmm0
3384 ; SSE-NEXT: packuswb %xmm4, %xmm0
3387 ; AVX1-LABEL: trunc_xor_v16i64_v16i8:
3389 ; AVX1-NEXT: vxorps %ymm4, %ymm0, %ymm0
3390 ; AVX1-NEXT: vxorps %ymm5, %ymm1, %ymm1
3391 ; AVX1-NEXT: vxorps %ymm6, %ymm2, %ymm2
3392 ; AVX1-NEXT: vxorps %ymm7, %ymm3, %ymm3
3393 ; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [255,255,255,255]
3394 ; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
3395 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
3396 ; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
3397 ; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
3398 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
3399 ; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2
3400 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
3401 ; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
3402 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
3403 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
3404 ; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
3405 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
3406 ; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
3407 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
3408 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
3409 ; AVX1-NEXT: vzeroupper
3412 ; AVX2-LABEL: trunc_xor_v16i64_v16i8:
3414 ; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm0
3415 ; AVX2-NEXT: vpxor %ymm5, %ymm1, %ymm1
3416 ; AVX2-NEXT: vpxor %ymm6, %ymm2, %ymm2
3417 ; AVX2-NEXT: vpxor %ymm7, %ymm3, %ymm3
3418 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
3419 ; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
3420 ; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
3421 ; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2
3422 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
3423 ; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
3424 ; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
3425 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
3426 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
3427 ; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0
3428 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
3429 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
3430 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
3431 ; AVX2-NEXT: vzeroupper
3434 ; AVX512-LABEL: trunc_xor_v16i64_v16i8:
3436 ; AVX512-NEXT: vpxorq %zmm2, %zmm0, %zmm0
3437 ; AVX512-NEXT: vpxorq %zmm3, %zmm1, %zmm1
3438 ; AVX512-NEXT: vpmovqb %zmm1, %xmm1
3439 ; AVX512-NEXT: vpmovqb %zmm0, %xmm0
3440 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3441 ; AVX512-NEXT: vzeroupper
3443 %1 = xor <16 x i64> %a0, %a1
3444 %2 = trunc <16 x i64> %1 to <16 x i8>
3445 ret <16 x i8> %2
3446 }
3448 define <16 x i8> @trunc_xor_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
3449 ; SSE-LABEL: trunc_xor_v16i32_v16i8:
3451 ; SSE-NEXT: pxor %xmm4, %xmm0
3452 ; SSE-NEXT: pxor %xmm5, %xmm1
3453 ; SSE-NEXT: pxor %xmm6, %xmm2
3454 ; SSE-NEXT: pxor %xmm7, %xmm3
3455 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
3456 ; SSE-NEXT: pand %xmm4, %xmm3
3457 ; SSE-NEXT: pand %xmm4, %xmm2
3458 ; SSE-NEXT: packuswb %xmm3, %xmm2
3459 ; SSE-NEXT: pand %xmm4, %xmm1
3460 ; SSE-NEXT: pand %xmm4, %xmm0
3461 ; SSE-NEXT: packuswb %xmm1, %xmm0
3462 ; SSE-NEXT: packuswb %xmm2, %xmm0
3465 ; AVX1-LABEL: trunc_xor_v16i32_v16i8:
3467 ; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0
3468 ; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm1
3469 ; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
3470 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
3471 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
3472 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
3473 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
3474 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
3475 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
3476 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
3477 ; AVX1-NEXT: vzeroupper
3480 ; AVX2-LABEL: trunc_xor_v16i32_v16i8:
3482 ; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
3483 ; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1
3484 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
3485 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
3486 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
3487 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
3488 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
3489 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
3490 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
3491 ; AVX2-NEXT: vzeroupper
3494 ; AVX512-LABEL: trunc_xor_v16i32_v16i8:
3496 ; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0
3497 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
3498 ; AVX512-NEXT: vzeroupper
3500 %1 = xor <16 x i32> %a0, %a1
3501 %2 = trunc <16 x i32> %1 to <16 x i8>
3502 ret <16 x i8> %2
3503 }
3505 define <16 x i8> @trunc_xor_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
3506 ; SSE-LABEL: trunc_xor_v16i16_v16i8:
3508 ; SSE-NEXT: pxor %xmm2, %xmm0
3509 ; SSE-NEXT: pxor %xmm3, %xmm1
3510 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
3511 ; SSE-NEXT: pand %xmm2, %xmm1
3512 ; SSE-NEXT: pand %xmm2, %xmm0
3513 ; SSE-NEXT: packuswb %xmm1, %xmm0
3516 ; AVX1-LABEL: trunc_xor_v16i16_v16i8:
3518 ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
3519 ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
3520 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
3521 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
3522 ; AVX1-NEXT: vzeroupper
3525 ; AVX2-LABEL: trunc_xor_v16i16_v16i8:
3527 ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
3528 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
3529 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
3530 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
3531 ; AVX2-NEXT: vzeroupper
3534 ; AVX512F-LABEL: trunc_xor_v16i16_v16i8:
3536 ; AVX512F-NEXT: vpxor %ymm1, %ymm0, %ymm0
3537 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
3538 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
3539 ; AVX512F-NEXT: vzeroupper
3540 ; AVX512F-NEXT: retq
3542 ; AVX512BW-LABEL: trunc_xor_v16i16_v16i8:
3543 ; AVX512BW: # %bb.0:
3544 ; AVX512BW-NEXT: vpxor %ymm1, %ymm0, %ymm0
3545 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
3546 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
3547 ; AVX512BW-NEXT: vzeroupper
3548 ; AVX512BW-NEXT: retq
3550 ; AVX512DQ-LABEL: trunc_xor_v16i16_v16i8:
3551 ; AVX512DQ: # %bb.0:
3552 ; AVX512DQ-NEXT: vpxor %ymm1, %ymm0, %ymm0
3553 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
3554 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
3555 ; AVX512DQ-NEXT: vzeroupper
3556 ; AVX512DQ-NEXT: retq
3557 %1 = xor <16 x i16> %a0, %a1
3558 %2 = trunc <16 x i16> %1 to <16 x i8>
3559 ret <16 x i8> %2
3560 }
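3562 ;
3563 ; xor (const)
3564 ;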
3566 define <4 x i32> @trunc_xor_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
3567 ; SSE-LABEL: trunc_xor_const_v4i64_v4i32:
3569 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3570 ; SSE-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
3573 ; AVX1-LABEL: trunc_xor_const_v4i64_v4i32:
3575 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
3576 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3577 ; AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3578 ; AVX1-NEXT: vzeroupper
3581 ; AVX2-SLOW-LABEL: trunc_xor_const_v4i64_v4i32:
3582 ; AVX2-SLOW: # %bb.0:
3583 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
3584 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3585 ; AVX2-SLOW-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3586 ; AVX2-SLOW-NEXT: vzeroupper
3587 ; AVX2-SLOW-NEXT: retq
3589 ; AVX2-FAST-ALL-LABEL: trunc_xor_const_v4i64_v4i32:
3590 ; AVX2-FAST-ALL: # %bb.0:
3591 ; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6]
3592 ; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1]
3593 ; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0
3594 ; AVX2-FAST-ALL-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3595 ; AVX2-FAST-ALL-NEXT: vzeroupper
3596 ; AVX2-FAST-ALL-NEXT: retq
3598 ; AVX2-FAST-PERLANE-LABEL: trunc_xor_const_v4i64_v4i32:
3599 ; AVX2-FAST-PERLANE: # %bb.0:
3600 ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1
3601 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3602 ; AVX2-FAST-PERLANE-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3603 ; AVX2-FAST-PERLANE-NEXT: vzeroupper
3604 ; AVX2-FAST-PERLANE-NEXT: retq
3606 ; AVX512-LABEL: trunc_xor_const_v4i64_v4i32:
3608 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
3609 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
3610 ; AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3611 ; AVX512-NEXT: vzeroupper
3613 %1 = xor <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
3614 %2 = trunc <4 x i64> %1 to <4 x i32>
3615 ret <4 x i32> %2
3616 }
3618 define <8 x i16> @trunc_xor_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
3619 ; SSE-LABEL: trunc_xor_const_v8i64_v8i16:
3621 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
3622 ; SSE-NEXT: pslld $16, %xmm2
3623 ; SSE-NEXT: psrad $16, %xmm2
3624 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3625 ; SSE-NEXT: pslld $16, %xmm0
3626 ; SSE-NEXT: psrad $16, %xmm0
3627 ; SSE-NEXT: packssdw %xmm2, %xmm0
3628 ; SSE-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
3631 ; AVX1-LABEL: trunc_xor_const_v8i64_v8i16:
3633 ; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,65535,65535,65535]
3634 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
3635 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
3636 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
3637 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
3638 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
3639 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
3640 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
3641 ; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3642 ; AVX1-NEXT: vzeroupper
3645 ; AVX2-LABEL: trunc_xor_const_v8i64_v8i16:
3647 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
3648 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
3649 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
3650 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
3651 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
3652 ; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
3653 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
3654 ; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3655 ; AVX2-NEXT: vzeroupper
3658 ; AVX512-LABEL: trunc_xor_const_v8i64_v8i16:
3660 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0
3661 ; AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3662 ; AVX512-NEXT: vzeroupper
3664 %1 = xor <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
3665 %2 = trunc <8 x i64> %1 to <8 x i16>
3666 ret <8 x i16> %2
3667 }
3669 define <8 x i16> @trunc_xor_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
3670 ; SSE-LABEL: trunc_xor_const_v8i32_v8i16:
3672 ; SSE-NEXT: pslld $16, %xmm1
3673 ; SSE-NEXT: psrad $16, %xmm1
3674 ; SSE-NEXT: pslld $16, %xmm0
3675 ; SSE-NEXT: psrad $16, %xmm0
3676 ; SSE-NEXT: packssdw %xmm1, %xmm0
3677 ; SSE-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
3680 ; AVX1-LABEL: trunc_xor_const_v8i32_v8i16:
3682 ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
3683 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
3684 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
3685 ; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3686 ; AVX1-NEXT: vzeroupper
3689 ; AVX2-LABEL: trunc_xor_const_v8i32_v8i16:
3691 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
3692 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3693 ; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3694 ; AVX2-NEXT: vzeroupper
3697 ; AVX512-LABEL: trunc_xor_const_v8i32_v8i16:
3699 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
3700 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
3701 ; AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3702 ; AVX512-NEXT: vzeroupper
3704 %1 = xor <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
3705 %2 = trunc <8 x i32> %1 to <8 x i16>
3706 ret <8 x i16> %2
3707 }
3709 define <16 x i8> @trunc_xor_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
3710 ; SSE-LABEL: trunc_xor_const_v16i64_v16i8:
3712 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
3713 ; SSE-NEXT: pand %xmm8, %xmm7
3714 ; SSE-NEXT: pand %xmm8, %xmm6
3715 ; SSE-NEXT: packuswb %xmm7, %xmm6
3716 ; SSE-NEXT: pand %xmm8, %xmm5
3717 ; SSE-NEXT: pand %xmm8, %xmm4
3718 ; SSE-NEXT: packuswb %xmm5, %xmm4
3719 ; SSE-NEXT: packuswb %xmm6, %xmm4
3720 ; SSE-NEXT: pand %xmm8, %xmm3
3721 ; SSE-NEXT: pand %xmm8, %xmm2
3722 ; SSE-NEXT: packuswb %xmm3, %xmm2
3723 ; SSE-NEXT: pand %xmm8, %xmm1
3724 ; SSE-NEXT: pand %xmm8, %xmm0
3725 ; SSE-NEXT: packuswb %xmm1, %xmm0
3726 ; SSE-NEXT: packuswb %xmm2, %xmm0
3727 ; SSE-NEXT: packuswb %xmm4, %xmm0
3728 ; SSE-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
3731 ; AVX1-LABEL: trunc_xor_const_v16i64_v16i8:
3733 ; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [255,255,255,255]
3734 ; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
3735 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
3736 ; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
3737 ; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
3738 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
3739 ; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2
3740 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
3741 ; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
3742 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
3743 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
3744 ; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
3745 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
3746 ; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
3747 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
3748 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
3749 ; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3750 ; AVX1-NEXT: vzeroupper
3753 ; AVX2-LABEL: trunc_xor_const_v16i64_v16i8:
3755 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
3756 ; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
3757 ; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
3758 ; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2
3759 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
3760 ; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
3761 ; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
3762 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
3763 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
3764 ; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0
3765 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
3766 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
3767 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
3768 ; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3769 ; AVX2-NEXT: vzeroupper
3772 ; AVX512-LABEL: trunc_xor_const_v16i64_v16i8:
3774 ; AVX512-NEXT: vpmovqb %zmm1, %xmm1
3775 ; AVX512-NEXT: vpmovqb %zmm0, %xmm0
3776 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3777 ; AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3778 ; AVX512-NEXT: vzeroupper
3780 %1 = xor <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
3781 %2 = trunc <16 x i64> %1 to <16 x i8>
3782 ret <16 x i8> %2
3783 }
3785 define <16 x i8> @trunc_xor_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
3786 ; SSE-LABEL: trunc_xor_const_v16i32_v16i8:
3788 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
3789 ; SSE-NEXT: pand %xmm4, %xmm3
3790 ; SSE-NEXT: pand %xmm4, %xmm2
3791 ; SSE-NEXT: packuswb %xmm3, %xmm2
3792 ; SSE-NEXT: pand %xmm4, %xmm1
3793 ; SSE-NEXT: pand %xmm4, %xmm0
3794 ; SSE-NEXT: packuswb %xmm1, %xmm0
3795 ; SSE-NEXT: packuswb %xmm2, %xmm0
3796 ; SSE-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
3799 ; AVX1-LABEL: trunc_xor_const_v16i32_v16i8:
3801 ; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
3802 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
3803 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
3804 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
3805 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
3806 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
3807 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
3808 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
3809 ; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3810 ; AVX1-NEXT: vzeroupper
3813 ; AVX2-LABEL: trunc_xor_const_v16i32_v16i8:
3815 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
3816 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
3817 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
3818 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
3819 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
3820 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
3821 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
3822 ; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3823 ; AVX2-NEXT: vzeroupper
3826 ; AVX512-LABEL: trunc_xor_const_v16i32_v16i8:
3828 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
3829 ; AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3830 ; AVX512-NEXT: vzeroupper
3832 %1 = xor <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
3833 %2 = trunc <16 x i32> %1 to <16 x i8>
3834 ret <16 x i8> %2
3835 }
3837 define <16 x i8> @trunc_xor_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
3838 ; SSE-LABEL: trunc_xor_const_v16i16_v16i8:
3840 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
3841 ; SSE-NEXT: pand %xmm2, %xmm1
3842 ; SSE-NEXT: pand %xmm2, %xmm0
3843 ; SSE-NEXT: packuswb %xmm1, %xmm0
3844 ; SSE-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
3847 ; AVX1-LABEL: trunc_xor_const_v16i16_v16i8:
3849 ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
3850 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
3851 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
3852 ; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3853 ; AVX1-NEXT: vzeroupper
3856 ; AVX2-LABEL: trunc_xor_const_v16i16_v16i8:
3858 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
3859 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
3860 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
3861 ; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3862 ; AVX2-NEXT: vzeroupper
3865 ; AVX512F-LABEL: trunc_xor_const_v16i16_v16i8:
3867 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
3868 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
3869 ; AVX512F-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3870 ; AVX512F-NEXT: vzeroupper
3871 ; AVX512F-NEXT: retq
3873 ; AVX512BW-LABEL: trunc_xor_const_v16i16_v16i8:
3874 ; AVX512BW: # %bb.0:
3875 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
3876 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
3877 ; AVX512BW-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3878 ; AVX512BW-NEXT: vzeroupper
3879 ; AVX512BW-NEXT: retq
3881 ; AVX512DQ-LABEL: trunc_xor_const_v16i16_v16i8:
3882 ; AVX512DQ: # %bb.0:
3883 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
3884 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
3885 ; AVX512DQ-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3886 ; AVX512DQ-NEXT: vzeroupper
3887 ; AVX512DQ-NEXT: retq
3888 %1 = xor <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
3889 %2 = trunc <16 x i16> %1 to <16 x i8>
3890 ret <16 x i8> %2
3891 }
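3893 ;
3894 ; or
3895 ;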
3897 define <4 x i32> @trunc_or_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
3898 ; SSE-LABEL: trunc_or_v4i64_v4i32:
3900 ; SSE-NEXT: orps %xmm3, %xmm1
3901 ; SSE-NEXT: orps %xmm2, %xmm0
3902 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3905 ; AVX1-LABEL: trunc_or_v4i64_v4i32:
3907 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
3908 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
3909 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3910 ; AVX1-NEXT: vzeroupper
3913 ; AVX2-SLOW-LABEL: trunc_or_v4i64_v4i32:
3914 ; AVX2-SLOW: # %bb.0:
3915 ; AVX2-SLOW-NEXT: vorps %ymm1, %ymm0, %ymm0
3916 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
3917 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3918 ; AVX2-SLOW-NEXT: vzeroupper
3919 ; AVX2-SLOW-NEXT: retq
3921 ; AVX2-FAST-ALL-LABEL: trunc_or_v4i64_v4i32:
3922 ; AVX2-FAST-ALL: # %bb.0:
3923 ; AVX2-FAST-ALL-NEXT: vorps %ymm1, %ymm0, %ymm0
3924 ; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6]
3925 ; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1]
3926 ; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0
3927 ; AVX2-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
3928 ; AVX2-FAST-ALL-NEXT: vzeroupper
3929 ; AVX2-FAST-ALL-NEXT: retq
3931 ; AVX2-FAST-PERLANE-LABEL: trunc_or_v4i64_v4i32:
3932 ; AVX2-FAST-PERLANE: # %bb.0:
3933 ; AVX2-FAST-PERLANE-NEXT: vorps %ymm1, %ymm0, %ymm0
3934 ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1
3935 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3936 ; AVX2-FAST-PERLANE-NEXT: vzeroupper
3937 ; AVX2-FAST-PERLANE-NEXT: retq
3939 ; AVX512-LABEL: trunc_or_v4i64_v4i32:
3941 ; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
3942 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
3943 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
3944 ; AVX512-NEXT: vzeroupper
3946 %1 = or <4 x i64> %a0, %a1
3947 %2 = trunc <4 x i64> %1 to <4 x i32>
3948 ret <4 x i32> %2
3949 }
3951 define <8 x i16> @trunc_or_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
3952 ; SSE-LABEL: trunc_or_v8i64_v8i16:
3954 ; SSE-NEXT: orps %xmm5, %xmm1
3955 ; SSE-NEXT: orps %xmm4, %xmm0
3956 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3957 ; SSE-NEXT: orps %xmm7, %xmm3
3958 ; SSE-NEXT: orps %xmm6, %xmm2
3959 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
3960 ; SSE-NEXT: pslld $16, %xmm2
3961 ; SSE-NEXT: psrad $16, %xmm2
3962 ; SSE-NEXT: pslld $16, %xmm0
3963 ; SSE-NEXT: psrad $16, %xmm0
3964 ; SSE-NEXT: packssdw %xmm2, %xmm0
3967 ; AVX1-LABEL: trunc_or_v8i64_v8i16:
3969 ; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0
3970 ; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1
3971 ; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,65535,65535,65535]
3972 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
3973 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
3974 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
3975 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
3976 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
3977 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
3978 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
3979 ; AVX1-NEXT: vzeroupper
3982 ; AVX2-LABEL: trunc_or_v8i64_v8i16:
3984 ; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0
3985 ; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1
3986 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
3987 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
3988 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
3989 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
3990 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
3991 ; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
3992 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
3993 ; AVX2-NEXT: vzeroupper
3996 ; AVX512-LABEL: trunc_or_v8i64_v8i16:
3998 ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
3999 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0
4000 ; AVX512-NEXT: vzeroupper
4002 %1 = or <8 x i64> %a0, %a1
4003 %2 = trunc <8 x i64> %1 to <8 x i16>
4004 ret <8 x i16> %2
4005 }
4007 define <8 x i16> @trunc_or_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
4008 ; SSE-LABEL: trunc_or_v8i32_v8i16:
4010 ; SSE-NEXT: por %xmm2, %xmm0
4011 ; SSE-NEXT: por %xmm3, %xmm1
4012 ; SSE-NEXT: pslld $16, %xmm1
4013 ; SSE-NEXT: psrad $16, %xmm1
4014 ; SSE-NEXT: pslld $16, %xmm0
4015 ; SSE-NEXT: psrad $16, %xmm0
4016 ; SSE-NEXT: packssdw %xmm1, %xmm0
4019 ; AVX1-LABEL: trunc_or_v8i32_v8i16:
4021 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
4022 ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
4023 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
4024 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
4025 ; AVX1-NEXT: vzeroupper
4028 ; AVX2-LABEL: trunc_or_v8i32_v8i16:
4030 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
4031 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
4032 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4033 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
4034 ; AVX2-NEXT: vzeroupper
4037 ; AVX512-LABEL: trunc_or_v8i32_v8i16:
4039 ; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
4040 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
4041 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
4042 ; AVX512-NEXT: vzeroupper
4044 %1 = or <8 x i32> %a0, %a1
4045 %2 = trunc <8 x i32> %1 to <8 x i16>
4046 ret <8 x i16> %2
4047 }
4049 define <16 x i8> @trunc_or_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
4050 ; SSE-LABEL: trunc_or_v16i64_v16i8:
4052 ; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm0
4053 ; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm1
4054 ; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm2
4055 ; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm3
4056 ; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm4
4057 ; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm5
4058 ; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm6
4059 ; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm7
4060 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
4061 ; SSE-NEXT: pand %xmm8, %xmm7
4062 ; SSE-NEXT: pand %xmm8, %xmm6
4063 ; SSE-NEXT: packuswb %xmm7, %xmm6
4064 ; SSE-NEXT: pand %xmm8, %xmm5
4065 ; SSE-NEXT: pand %xmm8, %xmm4
4066 ; SSE-NEXT: packuswb %xmm5, %xmm4
4067 ; SSE-NEXT: packuswb %xmm6, %xmm4
4068 ; SSE-NEXT: pand %xmm8, %xmm3
4069 ; SSE-NEXT: pand %xmm8, %xmm2
4070 ; SSE-NEXT: packuswb %xmm3, %xmm2
4071 ; SSE-NEXT: pand %xmm8, %xmm1
4072 ; SSE-NEXT: pand %xmm8, %xmm0
4073 ; SSE-NEXT: packuswb %xmm1, %xmm0
4074 ; SSE-NEXT: packuswb %xmm2, %xmm0
4075 ; SSE-NEXT: packuswb %xmm4, %xmm0
4078 ; AVX1-LABEL: trunc_or_v16i64_v16i8:
4080 ; AVX1-NEXT: vorps %ymm4, %ymm0, %ymm0
4081 ; AVX1-NEXT: vorps %ymm5, %ymm1, %ymm1
4082 ; AVX1-NEXT: vorps %ymm6, %ymm2, %ymm2
4083 ; AVX1-NEXT: vorps %ymm7, %ymm3, %ymm3
4084 ; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [255,255,255,255]
4085 ; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
4086 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
4087 ; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
4088 ; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
4089 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
4090 ; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2
4091 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
4092 ; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
4093 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
4094 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
4095 ; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
4096 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
4097 ; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
4098 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
4099 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
4100 ; AVX1-NEXT: vzeroupper
4103 ; AVX2-LABEL: trunc_or_v16i64_v16i8:
4105 ; AVX2-NEXT: vpor %ymm4, %ymm0, %ymm0
4106 ; AVX2-NEXT: vpor %ymm5, %ymm1, %ymm1
4107 ; AVX2-NEXT: vpor %ymm6, %ymm2, %ymm2
4108 ; AVX2-NEXT: vpor %ymm7, %ymm3, %ymm3
4109 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
4110 ; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
4111 ; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
4112 ; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2
4113 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
4114 ; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
4115 ; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
4116 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
4117 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
4118 ; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0
4119 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
4120 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
4121 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
4122 ; AVX2-NEXT: vzeroupper
4125 ; AVX512-LABEL: trunc_or_v16i64_v16i8:
4127 ; AVX512-NEXT: vporq %zmm2, %zmm0, %zmm0
4128 ; AVX512-NEXT: vporq %zmm3, %zmm1, %zmm1
4129 ; AVX512-NEXT: vpmovqb %zmm1, %xmm1
4130 ; AVX512-NEXT: vpmovqb %zmm0, %xmm0
4131 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4132 ; AVX512-NEXT: vzeroupper
4134 %1 = or <16 x i64> %a0, %a1
4135 %2 = trunc <16 x i64> %1 to <16 x i8>
4136 ret <16 x i8> %2
4137 }
4139 define <16 x i8> @trunc_or_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
4140 ; SSE-LABEL: trunc_or_v16i32_v16i8:
4142 ; SSE-NEXT: por %xmm4, %xmm0
4143 ; SSE-NEXT: por %xmm5, %xmm1
4144 ; SSE-NEXT: por %xmm6, %xmm2
4145 ; SSE-NEXT: por %xmm7, %xmm3
4146 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
4147 ; SSE-NEXT: pand %xmm4, %xmm3
4148 ; SSE-NEXT: pand %xmm4, %xmm2
4149 ; SSE-NEXT: packuswb %xmm3, %xmm2
4150 ; SSE-NEXT: pand %xmm4, %xmm1
4151 ; SSE-NEXT: pand %xmm4, %xmm0
4152 ; SSE-NEXT: packuswb %xmm1, %xmm0
4153 ; SSE-NEXT: packuswb %xmm2, %xmm0
4156 ; AVX1-LABEL: trunc_or_v16i32_v16i8:
4158 ; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0
4159 ; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1
4160 ; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
4161 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
4162 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
4163 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
4164 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
4165 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
4166 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
4167 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
4168 ; AVX1-NEXT: vzeroupper
4171 ; AVX2-LABEL: trunc_or_v16i32_v16i8:
4173 ; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0
4174 ; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1
4175 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
4176 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
4177 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
4178 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
4179 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
4180 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
4181 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
4182 ; AVX2-NEXT: vzeroupper
4185 ; AVX512-LABEL: trunc_or_v16i32_v16i8:
4187 ; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0
4188 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
4189 ; AVX512-NEXT: vzeroupper
4191 %1 = or <16 x i32> %a0, %a1
4192 %2 = trunc <16 x i32> %1 to <16 x i8>
4193 ret <16 x i8> %2
4194 }
4196 define <16 x i8> @trunc_or_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
4197 ; SSE-LABEL: trunc_or_v16i16_v16i8:
4199 ; SSE-NEXT: por %xmm2, %xmm0
4200 ; SSE-NEXT: por %xmm3, %xmm1
4201 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
4202 ; SSE-NEXT: pand %xmm2, %xmm1
4203 ; SSE-NEXT: pand %xmm2, %xmm0
4204 ; SSE-NEXT: packuswb %xmm1, %xmm0
4207 ; AVX1-LABEL: trunc_or_v16i16_v16i8:
4209 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
4210 ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
4211 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
4212 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
4213 ; AVX1-NEXT: vzeroupper
4216 ; AVX2-LABEL: trunc_or_v16i16_v16i8:
4218 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
4219 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
4220 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
4221 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
4222 ; AVX2-NEXT: vzeroupper
4225 ; AVX512F-LABEL: trunc_or_v16i16_v16i8:
4227 ; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
4228 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
4229 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
4230 ; AVX512F-NEXT: vzeroupper
4231 ; AVX512F-NEXT: retq
4233 ; AVX512BW-LABEL: trunc_or_v16i16_v16i8:
4234 ; AVX512BW: # %bb.0:
4235 ; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
4236 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
4237 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
4238 ; AVX512BW-NEXT: vzeroupper
4239 ; AVX512BW-NEXT: retq
4241 ; AVX512DQ-LABEL: trunc_or_v16i16_v16i8:
4242 ; AVX512DQ: # %bb.0:
4243 ; AVX512DQ-NEXT: vpor %ymm1, %ymm0, %ymm0
4244 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
4245 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
4246 ; AVX512DQ-NEXT: vzeroupper
4247 ; AVX512DQ-NEXT: retq
4248 %1 = or <16 x i16> %a0, %a1
4249 %2 = trunc <16 x i16> %1 to <16 x i8>
4250 ret <16 x i8> %2
4251 }
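4253 ;
4254 ; or (const)
4255 ;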
4257 define <4 x i32> @trunc_or_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
4258 ; SSE-LABEL: trunc_or_const_v4i64_v4i32:
4260 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
4261 ; SSE-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
4264 ; AVX1-LABEL: trunc_or_const_v4i64_v4i32:
4266 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
4267 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
4268 ; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4269 ; AVX1-NEXT: vzeroupper
4272 ; AVX2-SLOW-LABEL: trunc_or_const_v4i64_v4i32:
4273 ; AVX2-SLOW: # %bb.0:
4274 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
4275 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
4276 ; AVX2-SLOW-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4277 ; AVX2-SLOW-NEXT: vzeroupper
4278 ; AVX2-SLOW-NEXT: retq
4280 ; AVX2-FAST-ALL-LABEL: trunc_or_const_v4i64_v4i32:
4281 ; AVX2-FAST-ALL: # %bb.0:
4282 ; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6]
4283 ; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1]
4284 ; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0
4285 ; AVX2-FAST-ALL-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4286 ; AVX2-FAST-ALL-NEXT: vzeroupper
4287 ; AVX2-FAST-ALL-NEXT: retq
4289 ; AVX2-FAST-PERLANE-LABEL: trunc_or_const_v4i64_v4i32:
4290 ; AVX2-FAST-PERLANE: # %bb.0:
4291 ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1
4292 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
4293 ; AVX2-FAST-PERLANE-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4294 ; AVX2-FAST-PERLANE-NEXT: vzeroupper
4295 ; AVX2-FAST-PERLANE-NEXT: retq
4297 ; AVX512-LABEL: trunc_or_const_v4i64_v4i32:
4299 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
4300 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
4301 ; AVX512-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4302 ; AVX512-NEXT: vzeroupper
4304 %1 = or <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
4305 %2 = trunc <4 x i64> %1 to <4 x i32>
4306 ret <4 x i32> %2
4307 }
4309 define <8 x i16> @trunc_or_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
4310 ; SSE-LABEL: trunc_or_const_v8i64_v8i16:
4312 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
4313 ; SSE-NEXT: pslld $16, %xmm2
4314 ; SSE-NEXT: psrad $16, %xmm2
4315 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
4316 ; SSE-NEXT: pslld $16, %xmm0
4317 ; SSE-NEXT: psrad $16, %xmm0
4318 ; SSE-NEXT: packssdw %xmm2, %xmm0
4319 ; SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
4322 ; AVX1-LABEL: trunc_or_const_v8i64_v8i16:
4324 ; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,65535,65535,65535]
4325 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
4326 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
4327 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
4328 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
4329 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
4330 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
4331 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
4332 ; AVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4333 ; AVX1-NEXT: vzeroupper
4336 ; AVX2-LABEL: trunc_or_const_v8i64_v8i16:
4338 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
4339 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
4340 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
4341 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
4342 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
4343 ; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
4344 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
4345 ; AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4346 ; AVX2-NEXT: vzeroupper
4349 ; AVX512-LABEL: trunc_or_const_v8i64_v8i16:
4351 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0
4352 ; AVX512-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4353 ; AVX512-NEXT: vzeroupper
4355 %1 = or <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
4356 %2 = trunc <8 x i64> %1 to <8 x i16>
4357 ret <8 x i16> %2
4358 }
4360 define <8 x i16> @trunc_or_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
4361 ; SSE-LABEL: trunc_or_const_v8i32_v8i16:
4363 ; SSE-NEXT: pslld $16, %xmm1
4364 ; SSE-NEXT: psrad $16, %xmm1
4365 ; SSE-NEXT: pslld $16, %xmm0
4366 ; SSE-NEXT: psrad $16, %xmm0
4367 ; SSE-NEXT: packssdw %xmm1, %xmm0
4368 ; SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
4371 ; AVX1-LABEL: trunc_or_const_v8i32_v8i16:
4373 ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
4374 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
4375 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
4376 ; AVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4377 ; AVX1-NEXT: vzeroupper
4380 ; AVX2-LABEL: trunc_or_const_v8i32_v8i16:
4382 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
4383 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4384 ; AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4385 ; AVX2-NEXT: vzeroupper
4388 ; AVX512-LABEL: trunc_or_const_v8i32_v8i16:
4390 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
4391 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
4392 ; AVX512-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4393 ; AVX512-NEXT: vzeroupper
4395 %1 = or <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
4396 %2 = trunc <8 x i32> %1 to <8 x i16>
4397 ret <8 x i16> %2
4398 }
define <16 x i8> @trunc_or_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
; SSE-LABEL: trunc_or_const_v16i64_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSE-NEXT: pand %xmm8, %xmm7
; SSE-NEXT: pand %xmm8, %xmm6
; SSE-NEXT: packuswb %xmm7, %xmm6
; SSE-NEXT: pand %xmm8, %xmm5
; SSE-NEXT: pand %xmm8, %xmm4
; SSE-NEXT: packuswb %xmm5, %xmm4
; SSE-NEXT: packuswb %xmm6, %xmm4
; SSE-NEXT: pand %xmm8, %xmm3
; SSE-NEXT: pand %xmm8, %xmm2
; SSE-NEXT: packuswb %xmm3, %xmm2
; SSE-NEXT: pand %xmm8, %xmm1
; SSE-NEXT: pand %xmm8, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm4, %xmm0
; SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_or_const_v16i64_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [255,255,255,255]
; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_or_const_v16i64_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_or_const_v16i64_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovqb %zmm1, %xmm1
; AVX512-NEXT: vpmovqb %zmm0, %xmm0
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = or <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
%2 = trunc <16 x i64> %1 to <16 x i8>
ret <16 x i8> %2
}

define <16 x i8> @trunc_or_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
; SSE-LABEL: trunc_or_const_v16i32_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE-NEXT: pand %xmm4, %xmm3
; SSE-NEXT: pand %xmm4, %xmm2
; SSE-NEXT: packuswb %xmm3, %xmm2
; SSE-NEXT: pand %xmm4, %xmm1
; SSE-NEXT: pand %xmm4, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_or_const_v16i32_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_or_const_v16i32_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_or_const_v16i32_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = or <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%2 = trunc <16 x i32> %1 to <16 x i8>
ret <16 x i8> %2
}

define <16 x i8> @trunc_or_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
; SSE-LABEL: trunc_or_const_v16i16_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_or_const_v16i16_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_or_const_v16i16_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_or_const_v16i16_v16i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_or_const_v16i16_v16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_or_const_v16i16_v16i8:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
%1 = or <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
%2 = trunc <16 x i16> %1 to <16 x i8>
ret <16 x i8> %2
}

;
; complex patterns - often created by vectorizer
;
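; The tests below sign-extend <4 x i32> operands to <4 x i64>, combine them with
; mul/add, and truncate the result back to <4 x i32>; the AVX checks expect the
; whole sequence to be narrowed to 32-bit vpmulld/vpaddd rather than widened to i64.
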
define <4 x i32> @mul_add_const_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
; SSE-LABEL: mul_add_const_v4i64_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
; SSE-NEXT: pmuludq %xmm2, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
; SSE-NEXT: pmuludq %xmm3, %xmm1
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: mul_add_const_v4i64_v4i32:
; AVX: # %bb.0:
; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
%1 = sext <4 x i32> %a0 to <4 x i64>
%2 = sext <4 x i32> %a1 to <4 x i64>
%3 = mul <4 x i64> %1, %2
%4 = add <4 x i64> %3, <i64 -3, i64 -1, i64 1, i64 3>
%5 = trunc <4 x i64> %4 to <4 x i32>
ret <4 x i32> %5
}

define <4 x i32> @mul_add_self_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
; SSE-LABEL: mul_add_self_v4i64_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
; SSE-NEXT: pmuludq %xmm2, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
; SSE-NEXT: pmuludq %xmm3, %xmm1
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT: paddd %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: mul_add_self_v4i64_v4i32:
; AVX: # %bb.0:
; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = sext <4 x i32> %a0 to <4 x i64>
%2 = sext <4 x i32> %a1 to <4 x i64>
%3 = mul <4 x i64> %1, %2
%4 = add <4 x i64> %3, %3
%5 = trunc <4 x i64> %4 to <4 x i32>
ret <4 x i32> %5
}

define <4 x i32> @mul_add_multiuse_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
; SSE-LABEL: mul_add_multiuse_v4i64_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,1,1,3]
; SSE-NEXT: pmuludq %xmm2, %xmm4
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
; SSE-NEXT: pmuludq %xmm3, %xmm1
; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm1[0,2]
; SSE-NEXT: paddd %xmm4, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: mul_add_multiuse_v4i64_v4i32:
; AVX: # %bb.0:
; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm1
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = sext <4 x i32> %a0 to <4 x i64>
%2 = sext <4 x i32> %a1 to <4 x i64>
%3 = mul <4 x i64> %1, %2
%4 = add <4 x i64> %1, %3
%5 = trunc <4 x i64> %4 to <4 x i32>