1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST-ALL
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST-PERLANE
7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F
8 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW
9 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW
10 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512DQ
11 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512DQ
; Adds two <4 x i64> vectors and truncates the sum to <4 x i32>.
; The check lines below record the expected llc output for every RUN
; configuration; the AVX512 configurations, for instance, narrow the sum
; with vpmovqd while AVX2-FAST-ALL uses a vpermd with a [0,2,4,6] index vector.
17 define <4 x i32> @trunc_add_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
18 ; SSE-LABEL: trunc_add_v4i64_v4i32:
20 ; SSE-NEXT: paddq %xmm3, %xmm1
21 ; SSE-NEXT: paddq %xmm2, %xmm0
22 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
25 ; AVX1-LABEL: trunc_add_v4i64_v4i32:
27 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
28 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
29 ; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2
30 ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
31 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
32 ; AVX1-NEXT: vzeroupper
35 ; AVX2-SLOW-LABEL: trunc_add_v4i64_v4i32:
37 ; AVX2-SLOW-NEXT: vpaddq %ymm1, %ymm0, %ymm0
38 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
39 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
40 ; AVX2-SLOW-NEXT: vzeroupper
41 ; AVX2-SLOW-NEXT: retq
43 ; AVX2-FAST-ALL-LABEL: trunc_add_v4i64_v4i32:
44 ; AVX2-FAST-ALL: # %bb.0:
45 ; AVX2-FAST-ALL-NEXT: vpaddq %ymm1, %ymm0, %ymm0
46 ; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6]
47 ; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1]
48 ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0
49 ; AVX2-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
50 ; AVX2-FAST-ALL-NEXT: vzeroupper
51 ; AVX2-FAST-ALL-NEXT: retq
53 ; AVX2-FAST-PERLANE-LABEL: trunc_add_v4i64_v4i32:
54 ; AVX2-FAST-PERLANE: # %bb.0:
55 ; AVX2-FAST-PERLANE-NEXT: vpaddq %ymm1, %ymm0, %ymm0
56 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1
57 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
58 ; AVX2-FAST-PERLANE-NEXT: vzeroupper
59 ; AVX2-FAST-PERLANE-NEXT: retq
61 ; AVX512-LABEL: trunc_add_v4i64_v4i32:
63 ; AVX512-NEXT: vpaddq %ymm1, %ymm0, %ymm0
64 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
65 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
66 ; AVX512-NEXT: vzeroupper
; IR under test - a 64-bit element add whose result is truncated to 32-bit elements.
68 %1 = add <4 x i64> %a0, %a1
69 %2 = trunc <4 x i64> %1 to <4 x i32>
; Adds two <8 x i64> vectors and truncates the sum to <8 x i16>.
; In the recorded output the add stays at 64-bit element width (paddq / vpaddq)
; and the narrowing follows; the AVX512 configurations expect a single
; zmm vpaddq followed by vpmovqw.
73 define <8 x i16> @trunc_add_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
74 ; SSE-LABEL: trunc_add_v8i64_v8i16:
76 ; SSE-NEXT: paddq %xmm5, %xmm1
77 ; SSE-NEXT: paddq %xmm4, %xmm0
78 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
79 ; SSE-NEXT: paddq %xmm7, %xmm3
80 ; SSE-NEXT: paddq %xmm6, %xmm2
81 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
82 ; SSE-NEXT: pslld $16, %xmm2
83 ; SSE-NEXT: psrad $16, %xmm2
84 ; SSE-NEXT: pslld $16, %xmm0
85 ; SSE-NEXT: psrad $16, %xmm0
86 ; SSE-NEXT: packssdw %xmm2, %xmm0
89 ; AVX1-LABEL: trunc_add_v8i64_v8i16:
91 ; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm4
92 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
93 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
94 ; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0
95 ; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm2
96 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
97 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
98 ; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1
99 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
100 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
101 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
102 ; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
103 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
104 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1,2,3],xmm4[4],xmm3[5,6,7]
105 ; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
106 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
107 ; AVX1-NEXT: vzeroupper
110 ; AVX2-LABEL: trunc_add_v8i64_v8i16:
112 ; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
113 ; AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1
114 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
115 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
116 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
117 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
118 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
119 ; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
120 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
121 ; AVX2-NEXT: vzeroupper
124 ; AVX512-LABEL: trunc_add_v8i64_v8i16:
126 ; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0
127 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0
128 ; AVX512-NEXT: vzeroupper
; IR under test - a 64-bit element add whose result is truncated to 16-bit elements.
130 %1 = add <8 x i64> %a0, %a1
131 %2 = trunc <8 x i64> %1 to <8 x i16>
; Adds two <8 x i32> vectors and truncates the sum to <8 x i16>.
; Every recorded lowering performs the add at 32-bit element width
; (paddd / vpaddd) before narrowing; the AVX512 configurations narrow
; with vpmovdw.
135 define <8 x i16> @trunc_add_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
136 ; SSE-LABEL: trunc_add_v8i32_v8i16:
138 ; SSE-NEXT: paddd %xmm2, %xmm0
139 ; SSE-NEXT: paddd %xmm3, %xmm1
140 ; SSE-NEXT: pslld $16, %xmm1
141 ; SSE-NEXT: psrad $16, %xmm1
142 ; SSE-NEXT: pslld $16, %xmm0
143 ; SSE-NEXT: psrad $16, %xmm0
144 ; SSE-NEXT: packssdw %xmm1, %xmm0
147 ; AVX1-LABEL: trunc_add_v8i32_v8i16:
149 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm2
150 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
151 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
152 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
153 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
154 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
155 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4],xmm1[5],xmm2[6],xmm1[7]
156 ; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0
157 ; AVX1-NEXT: vzeroupper
160 ; AVX2-LABEL: trunc_add_v8i32_v8i16:
162 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
163 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
164 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
165 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
166 ; AVX2-NEXT: vzeroupper
169 ; AVX512-LABEL: trunc_add_v8i32_v8i16:
171 ; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0
172 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
173 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
174 ; AVX512-NEXT: vzeroupper
; IR under test - a 32-bit element add whose result is truncated to 16-bit elements.
176 %1 = add <8 x i32> %a0, %a1
177 %2 = trunc <8 x i32> %1 to <8 x i16>
; Adds two <16 x i64> vectors and truncates the sum to <16 x i8>.
; In the SSE output the second operand of each paddq is read from the stack
; (the {{[0-9]+}}(%rsp) pattern). The AVX512 configurations expect two
; vpmovqb truncations joined by vpunpcklqdq.
181 define <16 x i8> @trunc_add_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
182 ; SSE-LABEL: trunc_add_v16i64_v16i8:
184 ; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm0
185 ; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm1
186 ; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm2
187 ; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm3
188 ; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm4
189 ; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm5
190 ; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm6
191 ; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm7
192 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
193 ; SSE-NEXT: pand %xmm8, %xmm7
194 ; SSE-NEXT: pand %xmm8, %xmm6
195 ; SSE-NEXT: packuswb %xmm7, %xmm6
196 ; SSE-NEXT: pand %xmm8, %xmm5
197 ; SSE-NEXT: pand %xmm8, %xmm4
198 ; SSE-NEXT: packuswb %xmm5, %xmm4
199 ; SSE-NEXT: packuswb %xmm6, %xmm4
200 ; SSE-NEXT: pand %xmm8, %xmm3
201 ; SSE-NEXT: pand %xmm8, %xmm2
202 ; SSE-NEXT: packuswb %xmm3, %xmm2
203 ; SSE-NEXT: pand %xmm8, %xmm1
204 ; SSE-NEXT: pand %xmm8, %xmm0
205 ; SSE-NEXT: packuswb %xmm1, %xmm0
206 ; SSE-NEXT: packuswb %xmm2, %xmm0
207 ; SSE-NEXT: packuswb %xmm4, %xmm0
210 ; AVX1-LABEL: trunc_add_v16i64_v16i8:
212 ; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm8
213 ; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm4
214 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
215 ; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm0
216 ; AVX1-NEXT: vpaddq %xmm5, %xmm1, %xmm4
217 ; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm5
218 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
219 ; AVX1-NEXT: vpaddq %xmm5, %xmm1, %xmm1
220 ; AVX1-NEXT: vpaddq %xmm6, %xmm2, %xmm5
221 ; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm6
222 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
223 ; AVX1-NEXT: vpaddq %xmm6, %xmm2, %xmm2
224 ; AVX1-NEXT: vpaddq %xmm7, %xmm3, %xmm6
225 ; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm7
226 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
227 ; AVX1-NEXT: vpaddq %xmm7, %xmm3, %xmm3
228 ; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [255,255]
229 ; AVX1-NEXT: # xmm7 = mem[0,0]
230 ; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
231 ; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6
232 ; AVX1-NEXT: vpackusdw %xmm3, %xmm6, %xmm3
233 ; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2
234 ; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5
235 ; AVX1-NEXT: vpackusdw %xmm2, %xmm5, %xmm2
236 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
237 ; AVX1-NEXT: vpand %xmm7, %xmm1, %xmm1
238 ; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm3
239 ; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1
240 ; AVX1-NEXT: vpand %xmm7, %xmm0, %xmm0
241 ; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm3
242 ; AVX1-NEXT: vpackusdw %xmm0, %xmm3, %xmm0
243 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
244 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
245 ; AVX1-NEXT: vzeroupper
248 ; AVX2-LABEL: trunc_add_v16i64_v16i8:
250 ; AVX2-NEXT: vpaddq %ymm4, %ymm0, %ymm0
251 ; AVX2-NEXT: vpaddq %ymm5, %ymm1, %ymm1
252 ; AVX2-NEXT: vpaddq %ymm6, %ymm2, %ymm2
253 ; AVX2-NEXT: vpaddq %ymm7, %ymm3, %ymm3
254 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
255 ; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
256 ; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
257 ; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2
258 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
259 ; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
260 ; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
261 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
262 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
263 ; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0
264 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
265 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
266 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
267 ; AVX2-NEXT: vzeroupper
270 ; AVX512-LABEL: trunc_add_v16i64_v16i8:
272 ; AVX512-NEXT: vpaddq %zmm2, %zmm0, %zmm0
273 ; AVX512-NEXT: vpaddq %zmm3, %zmm1, %zmm1
274 ; AVX512-NEXT: vpmovqb %zmm1, %xmm1
275 ; AVX512-NEXT: vpmovqb %zmm0, %xmm0
276 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
277 ; AVX512-NEXT: vzeroupper
; IR under test - a 64-bit element add whose result is truncated to 8-bit elements.
279 %1 = add <16 x i64> %a0, %a1
280 %2 = trunc <16 x i64> %1 to <16 x i8>
; Adds two <16 x i32> vectors and truncates the sum to <16 x i8>.
; The SSE, AVX1 and AVX2 outputs mask each 32-bit element with 255 and then
; pack; the AVX512 configurations expect a zmm vpaddd followed by vpmovdb.
284 define <16 x i8> @trunc_add_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
285 ; SSE-LABEL: trunc_add_v16i32_v16i8:
287 ; SSE-NEXT: paddd %xmm4, %xmm0
288 ; SSE-NEXT: paddd %xmm5, %xmm1
289 ; SSE-NEXT: paddd %xmm6, %xmm2
290 ; SSE-NEXT: paddd %xmm7, %xmm3
291 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
292 ; SSE-NEXT: pand %xmm4, %xmm3
293 ; SSE-NEXT: pand %xmm4, %xmm2
294 ; SSE-NEXT: packuswb %xmm3, %xmm2
295 ; SSE-NEXT: pand %xmm4, %xmm1
296 ; SSE-NEXT: pand %xmm4, %xmm0
297 ; SSE-NEXT: packuswb %xmm1, %xmm0
298 ; SSE-NEXT: packuswb %xmm2, %xmm0
301 ; AVX1-LABEL: trunc_add_v16i32_v16i8:
303 ; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm4
304 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
305 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
306 ; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
307 ; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm2
308 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
309 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
310 ; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1
311 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255]
312 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
313 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
314 ; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
315 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
316 ; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm2
317 ; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
318 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
319 ; AVX1-NEXT: vzeroupper
322 ; AVX2-LABEL: trunc_add_v16i32_v16i8:
324 ; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0
325 ; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1
326 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
327 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
328 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
329 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
330 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
331 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
332 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
333 ; AVX2-NEXT: vzeroupper
336 ; AVX512-LABEL: trunc_add_v16i32_v16i8:
338 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
339 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
340 ; AVX512-NEXT: vzeroupper
; IR under test - a 32-bit element add whose result is truncated to 8-bit elements.
342 %1 = add <16 x i32> %a0, %a1
343 %2 = trunc <16 x i32> %1 to <16 x i8>
; Adds two <16 x i16> vectors and truncates the sum to <16 x i8>.
; This is the first test in the file where the AVX512 configurations are
; checked separately. The AVX512F and AVX512DQ outputs zero-extend to 32-bit
; elements (vpmovzxwd) and then use vpmovdb, whereas the AVX512BW output
; truncates directly with vpmovwb.
347 define <16 x i8> @trunc_add_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
348 ; SSE-LABEL: trunc_add_v16i16_v16i8:
350 ; SSE-NEXT: paddw %xmm2, %xmm0
351 ; SSE-NEXT: paddw %xmm3, %xmm1
352 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
353 ; SSE-NEXT: pand %xmm2, %xmm1
354 ; SSE-NEXT: pand %xmm2, %xmm0
355 ; SSE-NEXT: packuswb %xmm1, %xmm0
358 ; AVX1-LABEL: trunc_add_v16i16_v16i8:
360 ; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm2
361 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
362 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
363 ; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
364 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
365 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
366 ; AVX1-NEXT: vpand %xmm1, %xmm2, %xmm1
367 ; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0
368 ; AVX1-NEXT: vzeroupper
371 ; AVX2-LABEL: trunc_add_v16i16_v16i8:
373 ; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
374 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
375 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
376 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
377 ; AVX2-NEXT: vzeroupper
380 ; AVX512F-LABEL: trunc_add_v16i16_v16i8:
382 ; AVX512F-NEXT: vpaddw %ymm1, %ymm0, %ymm0
383 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
384 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
385 ; AVX512F-NEXT: vzeroupper
388 ; AVX512BW-LABEL: trunc_add_v16i16_v16i8:
390 ; AVX512BW-NEXT: vpaddw %ymm1, %ymm0, %ymm0
391 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
392 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
393 ; AVX512BW-NEXT: vzeroupper
394 ; AVX512BW-NEXT: retq
396 ; AVX512DQ-LABEL: trunc_add_v16i16_v16i8:
398 ; AVX512DQ-NEXT: vpaddw %ymm1, %ymm0, %ymm0
399 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
400 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
401 ; AVX512DQ-NEXT: vzeroupper
402 ; AVX512DQ-NEXT: retq
; IR under test - a 16-bit element add whose result is truncated to 8-bit elements.
403 %1 = add <16 x i16> %a0, %a1
404 %2 = trunc <16 x i16> %1 to <16 x i8>
; Takes the low eight bytes of %a0, sign-extends them to i32, adds %a1 and
; truncates the sum to <8 x i16>.
; In every recorded lowering both operands are narrowed to 16-bit elements
; first and the add is then a single paddw / vpaddw.
; This define carries no nounwind attribute, unlike the other tests visible
; in this part of the file.
408 define <8 x i16> @trunc_add_v8i32_v8i16_sext_8i8(<16 x i8> %a0, <8 x i32> %a1) {
409 ; SSE-LABEL: trunc_add_v8i32_v8i16_sext_8i8:
411 ; SSE-NEXT: pslld $16, %xmm2
412 ; SSE-NEXT: psrad $16, %xmm2
413 ; SSE-NEXT: pslld $16, %xmm1
414 ; SSE-NEXT: psrad $16, %xmm1
415 ; SSE-NEXT: packssdw %xmm2, %xmm1
416 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
417 ; SSE-NEXT: psraw $8, %xmm0
418 ; SSE-NEXT: paddw %xmm1, %xmm0
421 ; AVX1-LABEL: trunc_add_v8i32_v8i16_sext_8i8:
423 ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
424 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
425 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
426 ; AVX1-NEXT: vpmovsxbw %xmm0, %xmm0
427 ; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
428 ; AVX1-NEXT: vzeroupper
431 ; AVX2-LABEL: trunc_add_v8i32_v8i16_sext_8i8:
433 ; AVX2-NEXT: vpmovsxbw %xmm0, %xmm0
434 ; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
435 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
436 ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
437 ; AVX2-NEXT: vzeroupper
440 ; AVX512-LABEL: trunc_add_v8i32_v8i16_sext_8i8:
442 ; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
443 ; AVX512-NEXT: vpmovdw %zmm1, %ymm1
444 ; AVX512-NEXT: vpmovsxbw %xmm0, %xmm0
445 ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
446 ; AVX512-NEXT: vzeroupper
; IR under test - select elements 0..7 of %a0, sign-extend, add at 32 bits, truncate to 16 bits.
448 %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
449 %2 = sext <8 x i8> %1 to <8 x i32>
450 %3 = add <8 x i32> %2, %a1
451 %4 = trunc <8 x i32> %3 to <8 x i16>
; Adds the constant vector <0, 1, 2, 3> to a <4 x i64> input and truncates
; the sum to <4 x i32>.
; In every recorded lowering the input is narrowed first and the add is then
; a 32-bit paddd / vpaddd with a constant-pool (LCPI) memory operand.
459 define <4 x i32> @trunc_add_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
460 ; SSE-LABEL: trunc_add_const_v4i64_v4i32:
462 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
463 ; SSE-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
466 ; AVX1-LABEL: trunc_add_const_v4i64_v4i32:
468 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
469 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
470 ; AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
471 ; AVX1-NEXT: vzeroupper
474 ; AVX2-SLOW-LABEL: trunc_add_const_v4i64_v4i32:
475 ; AVX2-SLOW: # %bb.0:
476 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
477 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
478 ; AVX2-SLOW-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
479 ; AVX2-SLOW-NEXT: vzeroupper
480 ; AVX2-SLOW-NEXT: retq
482 ; AVX2-FAST-ALL-LABEL: trunc_add_const_v4i64_v4i32:
483 ; AVX2-FAST-ALL: # %bb.0:
484 ; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6]
485 ; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1]
486 ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0
487 ; AVX2-FAST-ALL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
488 ; AVX2-FAST-ALL-NEXT: vzeroupper
489 ; AVX2-FAST-ALL-NEXT: retq
491 ; AVX2-FAST-PERLANE-LABEL: trunc_add_const_v4i64_v4i32:
492 ; AVX2-FAST-PERLANE: # %bb.0:
493 ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1
494 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
495 ; AVX2-FAST-PERLANE-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
496 ; AVX2-FAST-PERLANE-NEXT: vzeroupper
497 ; AVX2-FAST-PERLANE-NEXT: retq
499 ; AVX512-LABEL: trunc_add_const_v4i64_v4i32:
501 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
502 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
503 ; AVX512-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
504 ; AVX512-NEXT: vzeroupper
; IR under test - a 64-bit element add of a constant, truncated to 32-bit elements.
506 %1 = add <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
507 %2 = trunc <4 x i64> %1 to <4 x i32>
; Adds the constant vector <0, 1, ..., 7> to a <8 x i64> input and truncates
; the sum to <8 x i16>.
; In every recorded lowering the input is narrowed first and the add is then
; a 16-bit paddw / vpaddw with a constant-pool (LCPI) memory operand.
511 define <8 x i16> @trunc_add_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
512 ; SSE-LABEL: trunc_add_const_v8i64_v8i16:
514 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
515 ; SSE-NEXT: pslld $16, %xmm2
516 ; SSE-NEXT: psrad $16, %xmm2
517 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
518 ; SSE-NEXT: pslld $16, %xmm0
519 ; SSE-NEXT: psrad $16, %xmm0
520 ; SSE-NEXT: packssdw %xmm2, %xmm0
521 ; SSE-NEXT: paddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
524 ; AVX1-LABEL: trunc_add_const_v8i64_v8i16:
526 ; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,65535,65535,65535]
527 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
528 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
529 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
530 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
531 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
532 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
533 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
534 ; AVX1-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
535 ; AVX1-NEXT: vzeroupper
538 ; AVX2-LABEL: trunc_add_const_v8i64_v8i16:
540 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
541 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
542 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
543 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
544 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
545 ; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
546 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
547 ; AVX2-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
548 ; AVX2-NEXT: vzeroupper
551 ; AVX512-LABEL: trunc_add_const_v8i64_v8i16:
553 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0
554 ; AVX512-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
555 ; AVX512-NEXT: vzeroupper
; IR under test - a 64-bit element add of a constant, truncated to 16-bit elements.
557 %1 = add <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
558 %2 = trunc <8 x i64> %1 to <8 x i16>
; Adds the constant vector <0, 1, ..., 7> to a <8 x i32> input and truncates
; the sum to <8 x i16>.
; In every recorded lowering the input is narrowed first and the add is then
; a 16-bit paddw / vpaddw with a constant-pool (LCPI) memory operand.
562 define <8 x i16> @trunc_add_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
563 ; SSE-LABEL: trunc_add_const_v8i32_v8i16:
565 ; SSE-NEXT: pslld $16, %xmm1
566 ; SSE-NEXT: psrad $16, %xmm1
567 ; SSE-NEXT: pslld $16, %xmm0
568 ; SSE-NEXT: psrad $16, %xmm0
569 ; SSE-NEXT: packssdw %xmm1, %xmm0
570 ; SSE-NEXT: paddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
573 ; AVX1-LABEL: trunc_add_const_v8i32_v8i16:
575 ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
576 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
577 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
578 ; AVX1-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
579 ; AVX1-NEXT: vzeroupper
582 ; AVX2-LABEL: trunc_add_const_v8i32_v8i16:
584 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
585 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
586 ; AVX2-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
587 ; AVX2-NEXT: vzeroupper
590 ; AVX512-LABEL: trunc_add_const_v8i32_v8i16:
592 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
593 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
594 ; AVX512-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
595 ; AVX512-NEXT: vzeroupper
; IR under test - a 32-bit element add of a constant, truncated to 16-bit elements.
597 %1 = add <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
598 %2 = trunc <8 x i32> %1 to <8 x i16>
; Adds the constant vector <0, 1, ..., 15> to a <16 x i64> input and
; truncates the sum to <16 x i8>.
; In every recorded lowering the input is narrowed to bytes first and the
; add is then a single paddb / vpaddb with a constant-pool (LCPI) operand.
602 define <16 x i8> @trunc_add_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
603 ; SSE-LABEL: trunc_add_const_v16i64_v16i8:
605 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
606 ; SSE-NEXT: pand %xmm8, %xmm7
607 ; SSE-NEXT: pand %xmm8, %xmm6
608 ; SSE-NEXT: packuswb %xmm7, %xmm6
609 ; SSE-NEXT: pand %xmm8, %xmm5
610 ; SSE-NEXT: pand %xmm8, %xmm4
611 ; SSE-NEXT: packuswb %xmm5, %xmm4
612 ; SSE-NEXT: packuswb %xmm6, %xmm4
613 ; SSE-NEXT: pand %xmm8, %xmm3
614 ; SSE-NEXT: pand %xmm8, %xmm2
615 ; SSE-NEXT: packuswb %xmm3, %xmm2
616 ; SSE-NEXT: pand %xmm8, %xmm1
617 ; SSE-NEXT: pand %xmm8, %xmm0
618 ; SSE-NEXT: packuswb %xmm1, %xmm0
619 ; SSE-NEXT: packuswb %xmm2, %xmm0
620 ; SSE-NEXT: packuswb %xmm4, %xmm0
621 ; SSE-NEXT: paddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
624 ; AVX1-LABEL: trunc_add_const_v16i64_v16i8:
626 ; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [255,255,255,255]
627 ; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
628 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
629 ; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
630 ; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
631 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
632 ; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2
633 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
634 ; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
635 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
636 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
637 ; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
638 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
639 ; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
640 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
641 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
642 ; AVX1-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
643 ; AVX1-NEXT: vzeroupper
646 ; AVX2-LABEL: trunc_add_const_v16i64_v16i8:
648 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
649 ; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
650 ; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
651 ; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2
652 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
653 ; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
654 ; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
655 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
656 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
657 ; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0
658 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
659 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
660 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
661 ; AVX2-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
662 ; AVX2-NEXT: vzeroupper
665 ; AVX512-LABEL: trunc_add_const_v16i64_v16i8:
667 ; AVX512-NEXT: vpmovqb %zmm1, %xmm1
668 ; AVX512-NEXT: vpmovqb %zmm0, %xmm0
669 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
670 ; AVX512-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
671 ; AVX512-NEXT: vzeroupper
; IR under test - a 64-bit element add of a constant, truncated to 8-bit elements.
673 %1 = add <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
674 %2 = trunc <16 x i64> %1 to <16 x i8>
; Adds the constant vector <0, 1, ..., 15> to a <16 x i32> input and
; truncates the sum to <16 x i8>.
; In every recorded lowering the input is narrowed to bytes first and the
; add is then a single paddb / vpaddb with a constant-pool (LCPI) operand.
678 define <16 x i8> @trunc_add_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
679 ; SSE-LABEL: trunc_add_const_v16i32_v16i8:
681 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
682 ; SSE-NEXT: pand %xmm4, %xmm3
683 ; SSE-NEXT: pand %xmm4, %xmm2
684 ; SSE-NEXT: packuswb %xmm3, %xmm2
685 ; SSE-NEXT: pand %xmm4, %xmm1
686 ; SSE-NEXT: pand %xmm4, %xmm0
687 ; SSE-NEXT: packuswb %xmm1, %xmm0
688 ; SSE-NEXT: packuswb %xmm2, %xmm0
689 ; SSE-NEXT: paddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
692 ; AVX1-LABEL: trunc_add_const_v16i32_v16i8:
694 ; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
695 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
696 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
697 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
698 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
699 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
700 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
701 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
702 ; AVX1-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
703 ; AVX1-NEXT: vzeroupper
706 ; AVX2-LABEL: trunc_add_const_v16i32_v16i8:
708 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
709 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
710 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
711 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
712 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
713 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
714 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
715 ; AVX2-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
716 ; AVX2-NEXT: vzeroupper
719 ; AVX512-LABEL: trunc_add_const_v16i32_v16i8:
721 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
722 ; AVX512-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
723 ; AVX512-NEXT: vzeroupper
; IR under test - a 32-bit element add of a constant, truncated to 8-bit elements.
725 %1 = add <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
726 %2 = trunc <16 x i32> %1 to <16 x i8>
; Adds the constant vector <0, 1, ..., 15> to a <16 x i16> input and
; truncates the sum to <16 x i8>.
; In every recorded lowering the input is narrowed to bytes first and the
; add is then a single paddb / vpaddb with a constant-pool (LCPI) operand.
; The AVX512F, AVX512BW and AVX512DQ configurations are checked separately
; because only the AVX512BW output narrows with vpmovwb; the other two
; zero-extend with vpmovzxwd and then use vpmovdb.
730 define <16 x i8> @trunc_add_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
731 ; SSE-LABEL: trunc_add_const_v16i16_v16i8:
733 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
734 ; SSE-NEXT: pand %xmm2, %xmm1
735 ; SSE-NEXT: pand %xmm2, %xmm0
736 ; SSE-NEXT: packuswb %xmm1, %xmm0
737 ; SSE-NEXT: paddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
740 ; AVX1-LABEL: trunc_add_const_v16i16_v16i8:
742 ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
743 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
744 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
745 ; AVX1-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
746 ; AVX1-NEXT: vzeroupper
749 ; AVX2-LABEL: trunc_add_const_v16i16_v16i8:
751 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
752 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
753 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
754 ; AVX2-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
755 ; AVX2-NEXT: vzeroupper
758 ; AVX512F-LABEL: trunc_add_const_v16i16_v16i8:
760 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
761 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
762 ; AVX512F-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
763 ; AVX512F-NEXT: vzeroupper
766 ; AVX512BW-LABEL: trunc_add_const_v16i16_v16i8:
768 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
769 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
770 ; AVX512BW-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
771 ; AVX512BW-NEXT: vzeroupper
772 ; AVX512BW-NEXT: retq
774 ; AVX512DQ-LABEL: trunc_add_const_v16i16_v16i8:
776 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
777 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
778 ; AVX512DQ-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
779 ; AVX512DQ-NEXT: vzeroupper
780 ; AVX512DQ-NEXT: retq
; IR under test - a 16-bit element add of a constant, truncated to 8-bit elements.
781 %1 = add <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
782 %2 = trunc <16 x i16> %1 to <16 x i8>
; Subtracts %a1 from %a0 (both <4 x i64>) and truncates the difference to
; <4 x i32>.
; Every recorded lowering performs the subtraction at 64-bit element width
; (psubq / vpsubq) before narrowing; the AVX512 configurations narrow with
; vpmovqd and AVX2-FAST-ALL uses a vpermd with a [0,2,4,6] index vector.
790 define <4 x i32> @trunc_sub_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
791 ; SSE-LABEL: trunc_sub_v4i64_v4i32:
793 ; SSE-NEXT: psubq %xmm3, %xmm1
794 ; SSE-NEXT: psubq %xmm2, %xmm0
795 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
798 ; AVX1-LABEL: trunc_sub_v4i64_v4i32:
800 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
801 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
802 ; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2
803 ; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0
804 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
805 ; AVX1-NEXT: vzeroupper
808 ; AVX2-SLOW-LABEL: trunc_sub_v4i64_v4i32:
809 ; AVX2-SLOW: # %bb.0:
810 ; AVX2-SLOW-NEXT: vpsubq %ymm1, %ymm0, %ymm0
811 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
812 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
813 ; AVX2-SLOW-NEXT: vzeroupper
814 ; AVX2-SLOW-NEXT: retq
816 ; AVX2-FAST-ALL-LABEL: trunc_sub_v4i64_v4i32:
817 ; AVX2-FAST-ALL: # %bb.0:
818 ; AVX2-FAST-ALL-NEXT: vpsubq %ymm1, %ymm0, %ymm0
819 ; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6]
820 ; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1]
821 ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0
822 ; AVX2-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
823 ; AVX2-FAST-ALL-NEXT: vzeroupper
824 ; AVX2-FAST-ALL-NEXT: retq
826 ; AVX2-FAST-PERLANE-LABEL: trunc_sub_v4i64_v4i32:
827 ; AVX2-FAST-PERLANE: # %bb.0:
828 ; AVX2-FAST-PERLANE-NEXT: vpsubq %ymm1, %ymm0, %ymm0
829 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1
830 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
831 ; AVX2-FAST-PERLANE-NEXT: vzeroupper
832 ; AVX2-FAST-PERLANE-NEXT: retq
834 ; AVX512-LABEL: trunc_sub_v4i64_v4i32:
836 ; AVX512-NEXT: vpsubq %ymm1, %ymm0, %ymm0
837 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
838 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
839 ; AVX512-NEXT: vzeroupper
; IR under test - a 64-bit element subtract whose result is truncated to 32-bit elements.
841 %1 = sub <4 x i64> %a0, %a1
842 %2 = trunc <4 x i64> %1 to <4 x i32>
; Test case: subtract two <8 x i64> vectors and truncate the difference to
; <8 x i16>. The assertions below expect the subtract to be done at 64-bit
; element width first (psubq/vpsubq), followed by the narrowing sequence for
; the given subtarget (shuffle + shift + pack, blend + pack, or a single
; vpmovqw in the AVX512 checks).
846 define <8 x i16> @trunc_sub_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
847 ; SSE-LABEL: trunc_sub_v8i64_v8i16:
849 ; SSE-NEXT: psubq %xmm5, %xmm1
850 ; SSE-NEXT: psubq %xmm4, %xmm0
851 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
852 ; SSE-NEXT: psubq %xmm7, %xmm3
853 ; SSE-NEXT: psubq %xmm6, %xmm2
854 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
855 ; SSE-NEXT: pslld $16, %xmm2
856 ; SSE-NEXT: psrad $16, %xmm2
857 ; SSE-NEXT: pslld $16, %xmm0
858 ; SSE-NEXT: psrad $16, %xmm0
859 ; SSE-NEXT: packssdw %xmm2, %xmm0
862 ; AVX1-LABEL: trunc_sub_v8i64_v8i16:
864 ; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm4
865 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
866 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
867 ; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0
868 ; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm2
869 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
870 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
871 ; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm1
872 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
873 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
874 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
875 ; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
876 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
877 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1,2,3],xmm4[4],xmm3[5,6,7]
878 ; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
879 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
880 ; AVX1-NEXT: vzeroupper
883 ; AVX2-LABEL: trunc_sub_v8i64_v8i16:
885 ; AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0
886 ; AVX2-NEXT: vpsubq %ymm3, %ymm1, %ymm1
887 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
888 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
889 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
890 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
891 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
892 ; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
893 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
894 ; AVX2-NEXT: vzeroupper
897 ; AVX512-LABEL: trunc_sub_v8i64_v8i16:
899 ; AVX512-NEXT: vpsubq %zmm1, %zmm0, %zmm0
900 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0
901 ; AVX512-NEXT: vzeroupper
903 %1 = sub <8 x i64> %a0, %a1
904 %2 = trunc <8 x i64> %1 to <8 x i16>
; Test case: subtract two <8 x i32> vectors and truncate the difference to
; <8 x i16>. The assertions below expect a 32-bit subtract (psubd/vpsubd)
; followed by the subtarget's narrowing sequence; the AVX512 checks expect
; vpmovdw on the widened zmm register.
908 define <8 x i16> @trunc_sub_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
909 ; SSE-LABEL: trunc_sub_v8i32_v8i16:
911 ; SSE-NEXT: psubd %xmm2, %xmm0
912 ; SSE-NEXT: psubd %xmm3, %xmm1
913 ; SSE-NEXT: pslld $16, %xmm1
914 ; SSE-NEXT: psrad $16, %xmm1
915 ; SSE-NEXT: pslld $16, %xmm0
916 ; SSE-NEXT: psrad $16, %xmm0
917 ; SSE-NEXT: packssdw %xmm1, %xmm0
920 ; AVX1-LABEL: trunc_sub_v8i32_v8i16:
922 ; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm2
923 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
924 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
925 ; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
926 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
927 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
928 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4],xmm1[5],xmm2[6],xmm1[7]
929 ; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0
930 ; AVX1-NEXT: vzeroupper
933 ; AVX2-LABEL: trunc_sub_v8i32_v8i16:
935 ; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0
936 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
937 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
938 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
939 ; AVX2-NEXT: vzeroupper
942 ; AVX512-LABEL: trunc_sub_v8i32_v8i16:
944 ; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm0
945 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
946 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
947 ; AVX512-NEXT: vzeroupper
949 %1 = sub <8 x i32> %a0, %a1
950 %2 = trunc <8 x i32> %1 to <8 x i16>
; Test case: subtract two <16 x i64> vectors and truncate the difference to
; <16 x i8>. In the SSE checks the subtrahend of every psubq is matched as an
; %rsp-relative memory operand. After the 64-bit subtracts, the SSE/AVX1/AVX2
; checks mask each lane with 255 and narrow through a chain of pack
; instructions; the AVX512 checks expect two vpmovqb results joined with
; vpunpcklqdq.
954 define <16 x i8> @trunc_sub_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
955 ; SSE-LABEL: trunc_sub_v16i64_v16i8:
957 ; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm0
958 ; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm1
959 ; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm2
960 ; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm3
961 ; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm4
962 ; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm5
963 ; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm6
964 ; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm7
965 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
966 ; SSE-NEXT: pand %xmm8, %xmm7
967 ; SSE-NEXT: pand %xmm8, %xmm6
968 ; SSE-NEXT: packuswb %xmm7, %xmm6
969 ; SSE-NEXT: pand %xmm8, %xmm5
970 ; SSE-NEXT: pand %xmm8, %xmm4
971 ; SSE-NEXT: packuswb %xmm5, %xmm4
972 ; SSE-NEXT: packuswb %xmm6, %xmm4
973 ; SSE-NEXT: pand %xmm8, %xmm3
974 ; SSE-NEXT: pand %xmm8, %xmm2
975 ; SSE-NEXT: packuswb %xmm3, %xmm2
976 ; SSE-NEXT: pand %xmm8, %xmm1
977 ; SSE-NEXT: pand %xmm8, %xmm0
978 ; SSE-NEXT: packuswb %xmm1, %xmm0
979 ; SSE-NEXT: packuswb %xmm2, %xmm0
980 ; SSE-NEXT: packuswb %xmm4, %xmm0
983 ; AVX1-LABEL: trunc_sub_v16i64_v16i8:
985 ; AVX1-NEXT: vpsubq %xmm4, %xmm0, %xmm8
986 ; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm4
987 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
988 ; AVX1-NEXT: vpsubq %xmm4, %xmm0, %xmm0
989 ; AVX1-NEXT: vpsubq %xmm5, %xmm1, %xmm4
990 ; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm5
991 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
992 ; AVX1-NEXT: vpsubq %xmm5, %xmm1, %xmm1
993 ; AVX1-NEXT: vpsubq %xmm6, %xmm2, %xmm5
994 ; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm6
995 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
996 ; AVX1-NEXT: vpsubq %xmm6, %xmm2, %xmm2
997 ; AVX1-NEXT: vpsubq %xmm7, %xmm3, %xmm6
998 ; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm7
999 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
1000 ; AVX1-NEXT: vpsubq %xmm7, %xmm3, %xmm3
1001 ; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [255,255]
1002 ; AVX1-NEXT: # xmm7 = mem[0,0]
1003 ; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
1004 ; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6
1005 ; AVX1-NEXT: vpackusdw %xmm3, %xmm6, %xmm3
1006 ; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2
1007 ; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5
1008 ; AVX1-NEXT: vpackusdw %xmm2, %xmm5, %xmm2
1009 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
1010 ; AVX1-NEXT: vpand %xmm7, %xmm1, %xmm1
1011 ; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm3
1012 ; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1
1013 ; AVX1-NEXT: vpand %xmm7, %xmm0, %xmm0
1014 ; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm3
1015 ; AVX1-NEXT: vpackusdw %xmm0, %xmm3, %xmm0
1016 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
1017 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
1018 ; AVX1-NEXT: vzeroupper
1021 ; AVX2-LABEL: trunc_sub_v16i64_v16i8:
1023 ; AVX2-NEXT: vpsubq %ymm4, %ymm0, %ymm0
1024 ; AVX2-NEXT: vpsubq %ymm5, %ymm1, %ymm1
1025 ; AVX2-NEXT: vpsubq %ymm6, %ymm2, %ymm2
1026 ; AVX2-NEXT: vpsubq %ymm7, %ymm3, %ymm3
1027 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
1028 ; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
1029 ; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
1030 ; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2
1031 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
1032 ; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
1033 ; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
1034 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
1035 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
1036 ; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0
1037 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1038 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
1039 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
1040 ; AVX2-NEXT: vzeroupper
1043 ; AVX512-LABEL: trunc_sub_v16i64_v16i8:
1045 ; AVX512-NEXT: vpsubq %zmm2, %zmm0, %zmm0
1046 ; AVX512-NEXT: vpsubq %zmm3, %zmm1, %zmm1
1047 ; AVX512-NEXT: vpmovqb %zmm1, %xmm1
1048 ; AVX512-NEXT: vpmovqb %zmm0, %xmm0
1049 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1050 ; AVX512-NEXT: vzeroupper
1052 %1 = sub <16 x i64> %a0, %a1
1053 %2 = trunc <16 x i64> %1 to <16 x i8>
; Test case: subtract two <16 x i32> vectors and truncate the difference to
; <16 x i8>. The assertions below expect 32-bit subtracts (psubd/vpsubd)
; followed by a 255 mask and pack chain on SSE/AVX1/AVX2, and a single
; vpsubd + vpmovdb pair in the AVX512 checks.
1057 define <16 x i8> @trunc_sub_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
1058 ; SSE-LABEL: trunc_sub_v16i32_v16i8:
1060 ; SSE-NEXT: psubd %xmm4, %xmm0
1061 ; SSE-NEXT: psubd %xmm5, %xmm1
1062 ; SSE-NEXT: psubd %xmm6, %xmm2
1063 ; SSE-NEXT: psubd %xmm7, %xmm3
1064 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
1065 ; SSE-NEXT: pand %xmm4, %xmm3
1066 ; SSE-NEXT: pand %xmm4, %xmm2
1067 ; SSE-NEXT: packuswb %xmm3, %xmm2
1068 ; SSE-NEXT: pand %xmm4, %xmm1
1069 ; SSE-NEXT: pand %xmm4, %xmm0
1070 ; SSE-NEXT: packuswb %xmm1, %xmm0
1071 ; SSE-NEXT: packuswb %xmm2, %xmm0
1074 ; AVX1-LABEL: trunc_sub_v16i32_v16i8:
1076 ; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm4
1077 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
1078 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1079 ; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm0
1080 ; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm2
1081 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
1082 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
1083 ; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm1
1084 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255]
1085 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
1086 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
1087 ; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
1088 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
1089 ; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm2
1090 ; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
1091 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
1092 ; AVX1-NEXT: vzeroupper
1095 ; AVX2-LABEL: trunc_sub_v16i32_v16i8:
1097 ; AVX2-NEXT: vpsubd %ymm2, %ymm0, %ymm0
1098 ; AVX2-NEXT: vpsubd %ymm3, %ymm1, %ymm1
1099 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
1100 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
1101 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
1102 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
1103 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1104 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
1105 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
1106 ; AVX2-NEXT: vzeroupper
1109 ; AVX512-LABEL: trunc_sub_v16i32_v16i8:
1111 ; AVX512-NEXT: vpsubd %zmm1, %zmm0, %zmm0
1112 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
1113 ; AVX512-NEXT: vzeroupper
1115 %1 = sub <16 x i32> %a0, %a1
1116 %2 = trunc <16 x i32> %1 to <16 x i8>
; Test case: subtract two <16 x i16> vectors and truncate the difference to
; <16 x i8>. All variants subtract at 16-bit width (psubw/vpsubw) first.
; The three AVX512 configurations are checked separately because they narrow
; differently: the AVX512F and AVX512DQ checks zero-extend to dwords
; (vpmovzxwd) and then use vpmovdb, while the AVX512BW checks use vpmovwb
; directly.
1120 define <16 x i8> @trunc_sub_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
1121 ; SSE-LABEL: trunc_sub_v16i16_v16i8:
1123 ; SSE-NEXT: psubw %xmm2, %xmm0
1124 ; SSE-NEXT: psubw %xmm3, %xmm1
1125 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
1126 ; SSE-NEXT: pand %xmm2, %xmm1
1127 ; SSE-NEXT: pand %xmm2, %xmm0
1128 ; SSE-NEXT: packuswb %xmm1, %xmm0
1131 ; AVX1-LABEL: trunc_sub_v16i16_v16i8:
1133 ; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm2
1134 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
1135 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1136 ; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm0
1137 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
1138 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
1139 ; AVX1-NEXT: vpand %xmm1, %xmm2, %xmm1
1140 ; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0
1141 ; AVX1-NEXT: vzeroupper
1144 ; AVX2-LABEL: trunc_sub_v16i16_v16i8:
1146 ; AVX2-NEXT: vpsubw %ymm1, %ymm0, %ymm0
1147 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1148 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1149 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
1150 ; AVX2-NEXT: vzeroupper
1153 ; AVX512F-LABEL: trunc_sub_v16i16_v16i8:
1155 ; AVX512F-NEXT: vpsubw %ymm1, %ymm0, %ymm0
1156 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1157 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
1158 ; AVX512F-NEXT: vzeroupper
1159 ; AVX512F-NEXT: retq
1161 ; AVX512BW-LABEL: trunc_sub_v16i16_v16i8:
1162 ; AVX512BW: # %bb.0:
1163 ; AVX512BW-NEXT: vpsubw %ymm1, %ymm0, %ymm0
1164 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
1165 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1166 ; AVX512BW-NEXT: vzeroupper
1167 ; AVX512BW-NEXT: retq
1169 ; AVX512DQ-LABEL: trunc_sub_v16i16_v16i8:
1170 ; AVX512DQ: # %bb.0:
1171 ; AVX512DQ-NEXT: vpsubw %ymm1, %ymm0, %ymm0
1172 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1173 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
1174 ; AVX512DQ-NEXT: vzeroupper
1175 ; AVX512DQ-NEXT: retq
1176 %1 = sub <16 x i16> %a0, %a1
1177 %2 = trunc <16 x i16> %1 to <16 x i8>
; Test case: zero-extend two <16 x i8> inputs to <16 x i16>, subtract them,
; and truncate the difference back to <16 x i8>. The assertions below expect
; the extend/truncate pair to disappear, leaving a single byte-wide subtract
; (psubb, or vpsubb under the shared AVX prefix).
1181 define <16 x i8> @trunc_ext_sub_v16i16_v16i8(<16 x i8> %x, <16 x i8> %y) {
1182 ; SSE-LABEL: trunc_ext_sub_v16i16_v16i8:
1184 ; SSE-NEXT: psubb %xmm1, %xmm0
1187 ; AVX-LABEL: trunc_ext_sub_v16i16_v16i8:
1189 ; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0
1191 %a = zext <16 x i8> %x to <16 x i16>
1192 %b = zext <16 x i8> %y to <16 x i16>
1193 %c = sub <16 x i16> %a, %b
1194 %d = trunc <16 x i16> %c to <16 x i8>
; Test case: subtract the constant vector <0, 1, 2, 3> from a <4 x i64> input
; and truncate the difference to <4 x i32>. Unlike the two-variable version,
; the assertions below expect the truncation to come first and the subtract
; to be performed afterwards at 32-bit width (psubd/vpsubd) against a
; constant-pool operand.
1202 define <4 x i32> @trunc_sub_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
1203 ; SSE-LABEL: trunc_sub_const_v4i64_v4i32:
1205 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
1206 ; SSE-NEXT: psubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1209 ; AVX1-LABEL: trunc_sub_const_v4i64_v4i32:
1211 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1212 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
1213 ; AVX1-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1214 ; AVX1-NEXT: vzeroupper
1217 ; AVX2-SLOW-LABEL: trunc_sub_const_v4i64_v4i32:
1218 ; AVX2-SLOW: # %bb.0:
1219 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
1220 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
1221 ; AVX2-SLOW-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1222 ; AVX2-SLOW-NEXT: vzeroupper
1223 ; AVX2-SLOW-NEXT: retq
1225 ; AVX2-FAST-ALL-LABEL: trunc_sub_const_v4i64_v4i32:
1226 ; AVX2-FAST-ALL: # %bb.0:
1227 ; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6]
1228 ; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1]
1229 ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0
1230 ; AVX2-FAST-ALL-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1231 ; AVX2-FAST-ALL-NEXT: vzeroupper
1232 ; AVX2-FAST-ALL-NEXT: retq
1234 ; AVX2-FAST-PERLANE-LABEL: trunc_sub_const_v4i64_v4i32:
1235 ; AVX2-FAST-PERLANE: # %bb.0:
1236 ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1
1237 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
1238 ; AVX2-FAST-PERLANE-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1239 ; AVX2-FAST-PERLANE-NEXT: vzeroupper
1240 ; AVX2-FAST-PERLANE-NEXT: retq
1242 ; AVX512-LABEL: trunc_sub_const_v4i64_v4i32:
1244 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1245 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
1246 ; AVX512-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1247 ; AVX512-NEXT: vzeroupper
1249 %1 = sub <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
1250 %2 = trunc <4 x i64> %1 to <4 x i32>
; Test case: subtract the constant vector <0, 1, ..., 7> from an <8 x i64>
; input and truncate the difference to <8 x i16>. The assertions below expect
; the input to be narrowed to 16-bit lanes first, with the subtract done last
; as a single psubw/vpsubw against a constant-pool operand.
1254 define <8 x i16> @trunc_sub_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
1255 ; SSE-LABEL: trunc_sub_const_v8i64_v8i16:
1257 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
1258 ; SSE-NEXT: pslld $16, %xmm2
1259 ; SSE-NEXT: psrad $16, %xmm2
1260 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
1261 ; SSE-NEXT: pslld $16, %xmm0
1262 ; SSE-NEXT: psrad $16, %xmm0
1263 ; SSE-NEXT: packssdw %xmm2, %xmm0
1264 ; SSE-NEXT: psubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1267 ; AVX1-LABEL: trunc_sub_const_v8i64_v8i16:
1269 ; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,65535,65535,65535]
1270 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
1271 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
1272 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
1273 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
1274 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
1275 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
1276 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
1277 ; AVX1-NEXT: vpsubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1278 ; AVX1-NEXT: vzeroupper
1281 ; AVX2-LABEL: trunc_sub_const_v8i64_v8i16:
1283 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
1284 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
1285 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
1286 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
1287 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1288 ; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
1289 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
1290 ; AVX2-NEXT: vpsubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1291 ; AVX2-NEXT: vzeroupper
1294 ; AVX512-LABEL: trunc_sub_const_v8i64_v8i16:
1296 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0
1297 ; AVX512-NEXT: vpsubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1298 ; AVX512-NEXT: vzeroupper
1300 %1 = sub <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
1301 %2 = trunc <8 x i64> %1 to <8 x i16>
; Test case: subtract the constant vector <0, 1, ..., 7> from an <8 x i32>
; input and truncate the difference to <8 x i16>. The assertions below expect
; the narrowing to 16-bit lanes to come first, followed by a single
; psubw/vpsubw against a constant-pool operand.
1305 define <8 x i16> @trunc_sub_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
1306 ; SSE-LABEL: trunc_sub_const_v8i32_v8i16:
1308 ; SSE-NEXT: pslld $16, %xmm1
1309 ; SSE-NEXT: psrad $16, %xmm1
1310 ; SSE-NEXT: pslld $16, %xmm0
1311 ; SSE-NEXT: psrad $16, %xmm0
1312 ; SSE-NEXT: packssdw %xmm1, %xmm0
1313 ; SSE-NEXT: psubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1316 ; AVX1-LABEL: trunc_sub_const_v8i32_v8i16:
1318 ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1319 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1320 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
1321 ; AVX1-NEXT: vpsubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1322 ; AVX1-NEXT: vzeroupper
1325 ; AVX2-LABEL: trunc_sub_const_v8i32_v8i16:
1327 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
1328 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1329 ; AVX2-NEXT: vpsubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1330 ; AVX2-NEXT: vzeroupper
1333 ; AVX512-LABEL: trunc_sub_const_v8i32_v8i16:
1335 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1336 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
1337 ; AVX512-NEXT: vpsubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1338 ; AVX512-NEXT: vzeroupper
1340 %1 = sub <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1341 %2 = trunc <8 x i32> %1 to <8 x i16>
; Test case: subtract the constant vector <0, 1, ..., 15> from a <16 x i64>
; input and truncate the difference to <16 x i8>. The assertions below expect
; the whole input to be narrowed to bytes first (mask with 255 plus a pack
; chain, or vpmovqb + vpunpcklqdq in the AVX512 checks), with one final
; psubb/vpsubb against a constant-pool operand.
1345 define <16 x i8> @trunc_sub_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
1346 ; SSE-LABEL: trunc_sub_const_v16i64_v16i8:
1348 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
1349 ; SSE-NEXT: pand %xmm8, %xmm7
1350 ; SSE-NEXT: pand %xmm8, %xmm6
1351 ; SSE-NEXT: packuswb %xmm7, %xmm6
1352 ; SSE-NEXT: pand %xmm8, %xmm5
1353 ; SSE-NEXT: pand %xmm8, %xmm4
1354 ; SSE-NEXT: packuswb %xmm5, %xmm4
1355 ; SSE-NEXT: packuswb %xmm6, %xmm4
1356 ; SSE-NEXT: pand %xmm8, %xmm3
1357 ; SSE-NEXT: pand %xmm8, %xmm2
1358 ; SSE-NEXT: packuswb %xmm3, %xmm2
1359 ; SSE-NEXT: pand %xmm8, %xmm1
1360 ; SSE-NEXT: pand %xmm8, %xmm0
1361 ; SSE-NEXT: packuswb %xmm1, %xmm0
1362 ; SSE-NEXT: packuswb %xmm2, %xmm0
1363 ; SSE-NEXT: packuswb %xmm4, %xmm0
1364 ; SSE-NEXT: psubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1367 ; AVX1-LABEL: trunc_sub_const_v16i64_v16i8:
1369 ; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [255,255,255,255]
1370 ; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
1371 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
1372 ; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
1373 ; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
1374 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
1375 ; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2
1376 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
1377 ; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
1378 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
1379 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
1380 ; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
1381 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
1382 ; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
1383 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
1384 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
1385 ; AVX1-NEXT: vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1386 ; AVX1-NEXT: vzeroupper
1389 ; AVX2-LABEL: trunc_sub_const_v16i64_v16i8:
1391 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
1392 ; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
1393 ; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
1394 ; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2
1395 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
1396 ; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
1397 ; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
1398 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
1399 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
1400 ; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0
1401 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1402 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
1403 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
1404 ; AVX2-NEXT: vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1405 ; AVX2-NEXT: vzeroupper
1408 ; AVX512-LABEL: trunc_sub_const_v16i64_v16i8:
1410 ; AVX512-NEXT: vpmovqb %zmm1, %xmm1
1411 ; AVX512-NEXT: vpmovqb %zmm0, %xmm0
1412 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1413 ; AVX512-NEXT: vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1414 ; AVX512-NEXT: vzeroupper
1416 %1 = sub <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
1417 %2 = trunc <16 x i64> %1 to <16 x i8>
; Test case: subtract the constant vector <0, 1, ..., 15> from a <16 x i32>
; input and truncate the difference to <16 x i8>. The assertions below expect
; the input to be narrowed to bytes first (mask with 255 plus packs, or
; vpmovdb in the AVX512 checks), followed by one psubb/vpsubb against a
; constant-pool operand.
1421 define <16 x i8> @trunc_sub_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
1422 ; SSE-LABEL: trunc_sub_const_v16i32_v16i8:
1424 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
1425 ; SSE-NEXT: pand %xmm4, %xmm3
1426 ; SSE-NEXT: pand %xmm4, %xmm2
1427 ; SSE-NEXT: packuswb %xmm3, %xmm2
1428 ; SSE-NEXT: pand %xmm4, %xmm1
1429 ; SSE-NEXT: pand %xmm4, %xmm0
1430 ; SSE-NEXT: packuswb %xmm1, %xmm0
1431 ; SSE-NEXT: packuswb %xmm2, %xmm0
1432 ; SSE-NEXT: psubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1435 ; AVX1-LABEL: trunc_sub_const_v16i32_v16i8:
1437 ; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
1438 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
1439 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
1440 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
1441 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
1442 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
1443 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
1444 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
1445 ; AVX1-NEXT: vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1446 ; AVX1-NEXT: vzeroupper
1449 ; AVX2-LABEL: trunc_sub_const_v16i32_v16i8:
1451 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
1452 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
1453 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
1454 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
1455 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1456 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
1457 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
1458 ; AVX2-NEXT: vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1459 ; AVX2-NEXT: vzeroupper
1462 ; AVX512-LABEL: trunc_sub_const_v16i32_v16i8:
1464 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
1465 ; AVX512-NEXT: vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1466 ; AVX512-NEXT: vzeroupper
1468 %1 = sub <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1469 %2 = trunc <16 x i32> %1 to <16 x i8>
; Test case: subtract the constant vector <0, 1, ..., 15> from a <16 x i16>
; input and truncate the difference to <16 x i8>. The assertions below expect
; the narrowing to bytes first and a final psubb/vpsubb against a
; constant-pool operand. The AVX512 configurations are checked separately
; because they narrow differently: the AVX512F and AVX512DQ checks go through
; vpmovzxwd + vpmovdb, the AVX512BW checks use vpmovwb.
1473 define <16 x i8> @trunc_sub_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
1474 ; SSE-LABEL: trunc_sub_const_v16i16_v16i8:
1476 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
1477 ; SSE-NEXT: pand %xmm2, %xmm1
1478 ; SSE-NEXT: pand %xmm2, %xmm0
1479 ; SSE-NEXT: packuswb %xmm1, %xmm0
1480 ; SSE-NEXT: psubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1483 ; AVX1-LABEL: trunc_sub_const_v16i16_v16i8:
1485 ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1486 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1487 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
1488 ; AVX1-NEXT: vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1489 ; AVX1-NEXT: vzeroupper
1492 ; AVX2-LABEL: trunc_sub_const_v16i16_v16i8:
1494 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1495 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1496 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
1497 ; AVX2-NEXT: vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1498 ; AVX2-NEXT: vzeroupper
1501 ; AVX512F-LABEL: trunc_sub_const_v16i16_v16i8:
1503 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1504 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
1505 ; AVX512F-NEXT: vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1506 ; AVX512F-NEXT: vzeroupper
1507 ; AVX512F-NEXT: retq
1509 ; AVX512BW-LABEL: trunc_sub_const_v16i16_v16i8:
1510 ; AVX512BW: # %bb.0:
1511 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1512 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
1513 ; AVX512BW-NEXT: vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1514 ; AVX512BW-NEXT: vzeroupper
1515 ; AVX512BW-NEXT: retq
1517 ; AVX512DQ-LABEL: trunc_sub_const_v16i16_v16i8:
1518 ; AVX512DQ: # %bb.0:
1519 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1520 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
1521 ; AVX512DQ-NEXT: vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1522 ; AVX512DQ-NEXT: vzeroupper
1523 ; AVX512DQ-NEXT: retq
1524 %1 = sub <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
1525 %2 = trunc <16 x i16> %1 to <16 x i8>
; Test case: zero-extend a <16 x i8> input to <16 x i16>, subtract the
; constant vector <0, 1, ..., 15> from it (constant on the right-hand side),
; and truncate back to <16 x i8>. The assertions below expect a single
; byte-wide subtract (psubb/vpsubb) against a constant-pool operand.
1529 define <16 x i8> @trunc_ext_sub_const_rhs_v16i16_v16i8(<16 x i8> %x) {
1530 ; SSE-LABEL: trunc_ext_sub_const_rhs_v16i16_v16i8:
1532 ; SSE-NEXT: psubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1535 ; AVX-LABEL: trunc_ext_sub_const_rhs_v16i16_v16i8:
1537 ; AVX-NEXT: vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1539 %a = zext <16 x i8> %x to <16 x i16>
1540 %b = sub <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
1541 %c = trunc <16 x i16> %b to <16 x i8>
; Test case: zero-extend a <16 x i8> input to <16 x i16>, subtract it from
; the constant vector <0, 1, ..., 15> (constant on the left-hand side), and
; truncate back to <16 x i8>. Because the constant is the minuend, the
; assertions below expect it to be loaded into xmm1 before the byte-wide
; subtract; the SSE checks additionally expect a movdqa copying the result
; from xmm1 back into xmm0.
1545 define <16 x i8> @trunc_ext_sub_const_lhs_v16i16_v16i8(<16 x i8> %x) {
1546 ; SSE-LABEL: trunc_ext_sub_const_lhs_v16i16_v16i8:
1548 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
1549 ; SSE-NEXT: psubb %xmm0, %xmm1
1550 ; SSE-NEXT: movdqa %xmm1, %xmm0
1553 ; AVX-LABEL: trunc_ext_sub_const_lhs_v16i16_v16i8:
1555 ; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
1556 ; AVX-NEXT: vpsubb %xmm0, %xmm1, %xmm0
1558 %a = zext <16 x i8> %x to <16 x i16>
1559 %b = sub <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, %a
1560 %c = trunc <16 x i16> %b to <16 x i8>
; Test case: multiply two <4 x i64> vectors and truncate the product to
; <4 x i32>. The expected lowering differs by subtarget, which is why the
; AVX512 configurations carry separate assertions here:
;  - the SSE checks multiply with pmuludq and then narrow with shufps;
;  - the AVX1, AVX2, AVX512F and AVX512BW checks truncate both operands
;    first and then multiply at 32-bit width with vpmulld;
;  - the AVX512DQ checks multiply at full width with vpmullq and then
;    truncate with vpmovqd.
1568 define <4 x i32> @trunc_mul_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
1569 ; SSE-LABEL: trunc_mul_v4i64_v4i32:
1571 ; SSE-NEXT: pmuludq %xmm3, %xmm1
1572 ; SSE-NEXT: pmuludq %xmm2, %xmm0
1573 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
1576 ; AVX1-LABEL: trunc_mul_v4i64_v4i32:
1578 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
1579 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
1580 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
1581 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
1582 ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
1583 ; AVX1-NEXT: vzeroupper
1586 ; AVX2-SLOW-LABEL: trunc_mul_v4i64_v4i32:
1587 ; AVX2-SLOW: # %bb.0:
1588 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
1589 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
1590 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
1591 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
1592 ; AVX2-SLOW-NEXT: vpmulld %xmm1, %xmm0, %xmm0
1593 ; AVX2-SLOW-NEXT: vzeroupper
1594 ; AVX2-SLOW-NEXT: retq
1596 ; AVX2-FAST-ALL-LABEL: trunc_mul_v4i64_v4i32:
1597 ; AVX2-FAST-ALL: # %bb.0:
1598 ; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
1599 ; AVX2-FAST-ALL-NEXT: vpermd %ymm1, %ymm2, %ymm1
1600 ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm2, %ymm0
1601 ; AVX2-FAST-ALL-NEXT: vpmulld %xmm1, %xmm0, %xmm0
1602 ; AVX2-FAST-ALL-NEXT: vzeroupper
1603 ; AVX2-FAST-ALL-NEXT: retq
1605 ; AVX2-FAST-PERLANE-LABEL: trunc_mul_v4i64_v4i32:
1606 ; AVX2-FAST-PERLANE: # %bb.0:
1607 ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm1, %xmm2
1608 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
1609 ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm2
1610 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
1611 ; AVX2-FAST-PERLANE-NEXT: vpmulld %xmm1, %xmm0, %xmm0
1612 ; AVX2-FAST-PERLANE-NEXT: vzeroupper
1613 ; AVX2-FAST-PERLANE-NEXT: retq
1615 ; AVX512F-LABEL: trunc_mul_v4i64_v4i32:
1617 ; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
1618 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1619 ; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
1620 ; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
1621 ; AVX512F-NEXT: vpmulld %xmm1, %xmm0, %xmm0
1622 ; AVX512F-NEXT: vzeroupper
1623 ; AVX512F-NEXT: retq
1625 ; AVX512BW-LABEL: trunc_mul_v4i64_v4i32:
1626 ; AVX512BW: # %bb.0:
1627 ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
1628 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1629 ; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
1630 ; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
1631 ; AVX512BW-NEXT: vpmulld %xmm1, %xmm0, %xmm0
1632 ; AVX512BW-NEXT: vzeroupper
1633 ; AVX512BW-NEXT: retq
1635 ; AVX512DQ-LABEL: trunc_mul_v4i64_v4i32:
1636 ; AVX512DQ: # %bb.0:
1637 ; AVX512DQ-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
1638 ; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1639 ; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
1640 ; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0
1641 ; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1642 ; AVX512DQ-NEXT: vzeroupper
1643 ; AVX512DQ-NEXT: retq
1644 %1 = mul <4 x i64> %a0, %a1
1645 %2 = trunc <4 x i64> %1 to <4 x i32>
; Multiplies two <8 x i64> vectors and truncates the product to <8 x i16>.
; In the SSE, AVX1, AVX2, AVX512F and AVX512BW checks below both operands are
; packed down to 16-bit lanes before a single 128-bit pmullw/vpmullw (the AVX2
; sequence applies one more vpshufd after the multiply). In the AVX512DQ checks
; the multiply stays 64-bit (vpmullq) and the product is narrowed afterwards
; with vpmovqw.
1649 define <8 x i16> @trunc_mul_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
1650 ; SSE-LABEL: trunc_mul_v8i64_v8i16:
1652 ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm7[0,2]
1653 ; SSE-NEXT: pslld $16, %xmm6
1654 ; SSE-NEXT: psrad $16, %xmm6
1655 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm5[0,2]
1656 ; SSE-NEXT: pslld $16, %xmm4
1657 ; SSE-NEXT: psrad $16, %xmm4
1658 ; SSE-NEXT: packssdw %xmm6, %xmm4
1659 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
1660 ; SSE-NEXT: pslld $16, %xmm2
1661 ; SSE-NEXT: psrad $16, %xmm2
1662 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
1663 ; SSE-NEXT: pslld $16, %xmm0
1664 ; SSE-NEXT: psrad $16, %xmm0
1665 ; SSE-NEXT: packssdw %xmm2, %xmm0
1666 ; SSE-NEXT: pmullw %xmm4, %xmm0
1669 ; AVX1-LABEL: trunc_mul_v8i64_v8i16:
1671 ; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [65535,65535,65535,65535]
1672 ; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
1673 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
1674 ; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
1675 ; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
1676 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
1677 ; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2
1678 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
1679 ; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
1680 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
1681 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
1682 ; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
1683 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
1684 ; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
1685 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
1686 ; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0
1687 ; AVX1-NEXT: vzeroupper
1690 ; AVX2-LABEL: trunc_mul_v8i64_v8i16:
1692 ; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
1693 ; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3],ymm3[4],ymm4[5,6,7],ymm3[8],ymm4[9,10,11],ymm3[12],ymm4[13,14,15]
1694 ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1,2,3],ymm2[4],ymm4[5,6,7],ymm2[8],ymm4[9,10,11],ymm2[12],ymm4[13,14,15]
1695 ; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2
1696 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
1697 ; AVX2-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
1698 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm4[1,2,3],ymm1[4],ymm4[5,6,7],ymm1[8],ymm4[9,10,11],ymm1[12],ymm4[13,14,15]
1699 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm4[1,2,3],ymm0[4],ymm4[5,6,7],ymm0[8],ymm4[9,10,11],ymm0[12],ymm4[13,14,15]
1700 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
1701 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1702 ; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
1703 ; AVX2-NEXT: vpmullw %xmm2, %xmm0, %xmm0
1704 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
1705 ; AVX2-NEXT: vzeroupper
1708 ; AVX512F-LABEL: trunc_mul_v8i64_v8i16:
1710 ; AVX512F-NEXT: vpmovqw %zmm1, %xmm1
1711 ; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
1712 ; AVX512F-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1713 ; AVX512F-NEXT: vzeroupper
1714 ; AVX512F-NEXT: retq
1716 ; AVX512BW-LABEL: trunc_mul_v8i64_v8i16:
1717 ; AVX512BW: # %bb.0:
1718 ; AVX512BW-NEXT: vpmovqw %zmm1, %xmm1
1719 ; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0
1720 ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1721 ; AVX512BW-NEXT: vzeroupper
1722 ; AVX512BW-NEXT: retq
1724 ; AVX512DQ-LABEL: trunc_mul_v8i64_v8i16:
1725 ; AVX512DQ: # %bb.0:
1726 ; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
1727 ; AVX512DQ-NEXT: vpmovqw %zmm0, %xmm0
1728 ; AVX512DQ-NEXT: vzeroupper
1729 ; AVX512DQ-NEXT: retq
1730 %1 = mul <8 x i64> %a0, %a1
1731 %2 = trunc <8 x i64> %1 to <8 x i16>
; Multiplies two <8 x i32> vectors and truncates the product to <8 x i16>.
; All check groups below perform the 32-bit multiply first and narrow
; afterwards. The SSE group assembles the multiply from pmuludq plus
; pshufd/punpckldq, AVX1 uses two 128-bit vpmulld, and the AVX2 and AVX512
; groups use a single 256-bit vpmulld followed by vpshufb+vpermq or vpmovdw.
1735 define <8 x i16> @trunc_mul_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
1736 ; SSE-LABEL: trunc_mul_v8i32_v8i16:
1738 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
1739 ; SSE-NEXT: pmuludq %xmm2, %xmm0
1740 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1741 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
1742 ; SSE-NEXT: pmuludq %xmm4, %xmm2
1743 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
1744 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
1745 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
1746 ; SSE-NEXT: pmuludq %xmm3, %xmm1
1747 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1748 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
1749 ; SSE-NEXT: pmuludq %xmm2, %xmm3
1750 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
1751 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
1752 ; SSE-NEXT: pslld $16, %xmm1
1753 ; SSE-NEXT: psrad $16, %xmm1
1754 ; SSE-NEXT: pslld $16, %xmm0
1755 ; SSE-NEXT: psrad $16, %xmm0
1756 ; SSE-NEXT: packssdw %xmm1, %xmm0
1759 ; AVX1-LABEL: trunc_mul_v8i32_v8i16:
1761 ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm2
1762 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
1763 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1764 ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
1765 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
1766 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
1767 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4],xmm1[5],xmm2[6],xmm1[7]
1768 ; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0
1769 ; AVX1-NEXT: vzeroupper
1772 ; AVX2-LABEL: trunc_mul_v8i32_v8i16:
1774 ; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0
1775 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
1776 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1777 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1778 ; AVX2-NEXT: vzeroupper
1781 ; AVX512-LABEL: trunc_mul_v8i32_v8i16:
1783 ; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm0
1784 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
1785 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1786 ; AVX512-NEXT: vzeroupper
1788 %1 = mul <8 x i32> %a0, %a1
1789 %2 = trunc <8 x i32> %1 to <8 x i16>
; Multiplies two <16 x i64> vectors and truncates the product to <16 x i8>.
; The SSE checks multiply xmm0-xmm7 by memory operands addressed off %rsp,
; then mask every lane with 255 and pack down with packuswb. The AVX1 and AVX2
; checks use vpmuludq followed by a 255 mask and vpackusdw/vpackuswb. The
; AVX512F and AVX512BW checks use vpmuludq and the AVX512DQ checks use vpmullq;
; all three then narrow each half with vpmovqb and join the halves with
; vpunpcklqdq.
1793 define <16 x i8> @trunc_mul_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
1794 ; SSE-LABEL: trunc_mul_v16i64_v16i8:
1796 ; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm0
1797 ; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm1
1798 ; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm2
1799 ; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm3
1800 ; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm4
1801 ; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm5
1802 ; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm6
1803 ; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm7
1804 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
1805 ; SSE-NEXT: pand %xmm8, %xmm7
1806 ; SSE-NEXT: pand %xmm8, %xmm6
1807 ; SSE-NEXT: packuswb %xmm7, %xmm6
1808 ; SSE-NEXT: pand %xmm8, %xmm5
1809 ; SSE-NEXT: pand %xmm8, %xmm4
1810 ; SSE-NEXT: packuswb %xmm5, %xmm4
1811 ; SSE-NEXT: packuswb %xmm6, %xmm4
1812 ; SSE-NEXT: pand %xmm8, %xmm3
1813 ; SSE-NEXT: pand %xmm8, %xmm2
1814 ; SSE-NEXT: packuswb %xmm3, %xmm2
1815 ; SSE-NEXT: pand %xmm8, %xmm1
1816 ; SSE-NEXT: pand %xmm8, %xmm0
1817 ; SSE-NEXT: packuswb %xmm1, %xmm0
1818 ; SSE-NEXT: packuswb %xmm2, %xmm0
1819 ; SSE-NEXT: packuswb %xmm4, %xmm0
1822 ; AVX1-LABEL: trunc_mul_v16i64_v16i8:
1824 ; AVX1-NEXT: vpmuludq %xmm4, %xmm0, %xmm8
1825 ; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm4
1826 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1827 ; AVX1-NEXT: vpmuludq %xmm4, %xmm0, %xmm0
1828 ; AVX1-NEXT: vpmuludq %xmm5, %xmm1, %xmm4
1829 ; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm5
1830 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
1831 ; AVX1-NEXT: vpmuludq %xmm5, %xmm1, %xmm1
1832 ; AVX1-NEXT: vpmuludq %xmm6, %xmm2, %xmm5
1833 ; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm6
1834 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
1835 ; AVX1-NEXT: vpmuludq %xmm6, %xmm2, %xmm2
1836 ; AVX1-NEXT: vpmuludq %xmm7, %xmm3, %xmm6
1837 ; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm7
1838 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
1839 ; AVX1-NEXT: vpmuludq %xmm7, %xmm3, %xmm3
1840 ; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [255,255]
1841 ; AVX1-NEXT: # xmm7 = mem[0,0]
1842 ; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
1843 ; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6
1844 ; AVX1-NEXT: vpackusdw %xmm3, %xmm6, %xmm3
1845 ; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2
1846 ; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5
1847 ; AVX1-NEXT: vpackusdw %xmm2, %xmm5, %xmm2
1848 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
1849 ; AVX1-NEXT: vpand %xmm7, %xmm1, %xmm1
1850 ; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm3
1851 ; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1
1852 ; AVX1-NEXT: vpand %xmm7, %xmm0, %xmm0
1853 ; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm3
1854 ; AVX1-NEXT: vpackusdw %xmm0, %xmm3, %xmm0
1855 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
1856 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
1857 ; AVX1-NEXT: vzeroupper
1860 ; AVX2-LABEL: trunc_mul_v16i64_v16i8:
1862 ; AVX2-NEXT: vpmuludq %ymm4, %ymm0, %ymm0
1863 ; AVX2-NEXT: vpmuludq %ymm5, %ymm1, %ymm1
1864 ; AVX2-NEXT: vpmuludq %ymm6, %ymm2, %ymm2
1865 ; AVX2-NEXT: vpmuludq %ymm7, %ymm3, %ymm3
1866 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
1867 ; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
1868 ; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
1869 ; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2
1870 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
1871 ; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
1872 ; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
1873 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
1874 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
1875 ; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0
1876 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1877 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
1878 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
1879 ; AVX2-NEXT: vzeroupper
1882 ; AVX512F-LABEL: trunc_mul_v16i64_v16i8:
1884 ; AVX512F-NEXT: vpmuludq %zmm2, %zmm0, %zmm0
1885 ; AVX512F-NEXT: vpmuludq %zmm3, %zmm1, %zmm1
1886 ; AVX512F-NEXT: vpmovqb %zmm1, %xmm1
1887 ; AVX512F-NEXT: vpmovqb %zmm0, %xmm0
1888 ; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1889 ; AVX512F-NEXT: vzeroupper
1890 ; AVX512F-NEXT: retq
1892 ; AVX512BW-LABEL: trunc_mul_v16i64_v16i8:
1893 ; AVX512BW: # %bb.0:
1894 ; AVX512BW-NEXT: vpmuludq %zmm2, %zmm0, %zmm0
1895 ; AVX512BW-NEXT: vpmuludq %zmm3, %zmm1, %zmm1
1896 ; AVX512BW-NEXT: vpmovqb %zmm1, %xmm1
1897 ; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0
1898 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1899 ; AVX512BW-NEXT: vzeroupper
1900 ; AVX512BW-NEXT: retq
1902 ; AVX512DQ-LABEL: trunc_mul_v16i64_v16i8:
1903 ; AVX512DQ: # %bb.0:
1904 ; AVX512DQ-NEXT: vpmullq %zmm2, %zmm0, %zmm0
1905 ; AVX512DQ-NEXT: vpmullq %zmm3, %zmm1, %zmm1
1906 ; AVX512DQ-NEXT: vpmovqb %zmm1, %xmm1
1907 ; AVX512DQ-NEXT: vpmovqb %zmm0, %xmm0
1908 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1909 ; AVX512DQ-NEXT: vzeroupper
1910 ; AVX512DQ-NEXT: retq
1911 %1 = mul <16 x i64> %a0, %a1
1912 %2 = trunc <16 x i64> %1 to <16 x i8>
; Multiplies two <16 x i32> vectors and truncates the product to <16 x i8>.
; The SSE checks assemble each 32-bit multiply from pmuludq plus
; pshufd/punpckldq, then mask with 255 and pack with packuswb. AVX1 uses four
; 128-bit vpmulld and AVX2 two 256-bit vpmulld, each followed by a 255 mask and
; vpackusdw/vpackuswb. The shared AVX512 checks use one 512-bit vpmulld
; followed by vpmovdb.
1916 define <16 x i8> @trunc_mul_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
1917 ; SSE-LABEL: trunc_mul_v16i32_v16i8:
1919 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3]
1920 ; SSE-NEXT: pmuludq %xmm4, %xmm0
1921 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1922 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
1923 ; SSE-NEXT: pmuludq %xmm8, %xmm4
1924 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
1925 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
1926 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
1927 ; SSE-NEXT: pmuludq %xmm5, %xmm1
1928 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1929 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
1930 ; SSE-NEXT: pmuludq %xmm4, %xmm5
1931 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3]
1932 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
1933 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
1934 ; SSE-NEXT: pmuludq %xmm6, %xmm2
1935 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
1936 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
1937 ; SSE-NEXT: pmuludq %xmm4, %xmm5
1938 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3]
1939 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
1940 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
1941 ; SSE-NEXT: pmuludq %xmm7, %xmm3
1942 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
1943 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
1944 ; SSE-NEXT: pmuludq %xmm4, %xmm5
1945 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3]
1946 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
1947 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
1948 ; SSE-NEXT: pand %xmm4, %xmm3
1949 ; SSE-NEXT: pand %xmm4, %xmm2
1950 ; SSE-NEXT: packuswb %xmm3, %xmm2
1951 ; SSE-NEXT: pand %xmm4, %xmm1
1952 ; SSE-NEXT: pand %xmm4, %xmm0
1953 ; SSE-NEXT: packuswb %xmm1, %xmm0
1954 ; SSE-NEXT: packuswb %xmm2, %xmm0
1957 ; AVX1-LABEL: trunc_mul_v16i32_v16i8:
1959 ; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm4
1960 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
1961 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1962 ; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm0
1963 ; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm2
1964 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
1965 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
1966 ; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm1
1967 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255]
1968 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
1969 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
1970 ; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
1971 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
1972 ; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm2
1973 ; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
1974 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
1975 ; AVX1-NEXT: vzeroupper
1978 ; AVX2-LABEL: trunc_mul_v16i32_v16i8:
1980 ; AVX2-NEXT: vpmulld %ymm2, %ymm0, %ymm0
1981 ; AVX2-NEXT: vpmulld %ymm3, %ymm1, %ymm1
1982 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
1983 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
1984 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
1985 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
1986 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1987 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
1988 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
1989 ; AVX2-NEXT: vzeroupper
1992 ; AVX512-LABEL: trunc_mul_v16i32_v16i8:
1994 ; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0
1995 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
1996 ; AVX512-NEXT: vzeroupper
1998 %1 = mul <16 x i32> %a0, %a1
1999 %2 = trunc <16 x i32> %1 to <16 x i8>
; Multiplies two <16 x i16> vectors and truncates the product to <16 x i8>.
; Every check group multiplies at 16-bit width (pmullw/vpmullw) and narrows
; afterwards. SSE, AVX1 and AVX2 mask the low byte of each lane and use
; packuswb/vpackuswb. The AVX512F and AVX512DQ checks widen with vpmovzxwd and
; narrow with vpmovdb, while the AVX512BW checks narrow directly with vpmovwb.
2003 define <16 x i8> @trunc_mul_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
2004 ; SSE-LABEL: trunc_mul_v16i16_v16i8:
2006 ; SSE-NEXT: pmullw %xmm2, %xmm0
2007 ; SSE-NEXT: pmullw %xmm3, %xmm1
2008 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
2009 ; SSE-NEXT: pand %xmm2, %xmm1
2010 ; SSE-NEXT: pand %xmm2, %xmm0
2011 ; SSE-NEXT: packuswb %xmm1, %xmm0
2014 ; AVX1-LABEL: trunc_mul_v16i16_v16i8:
2016 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm2
2017 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
2018 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2019 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
2020 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
2021 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
2022 ; AVX1-NEXT: vpand %xmm1, %xmm2, %xmm1
2023 ; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0
2024 ; AVX1-NEXT: vzeroupper
2027 ; AVX2-LABEL: trunc_mul_v16i16_v16i8:
2029 ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
2030 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2031 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
2032 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
2033 ; AVX2-NEXT: vzeroupper
2036 ; AVX512F-LABEL: trunc_mul_v16i16_v16i8:
2038 ; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm0
2039 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
2040 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
2041 ; AVX512F-NEXT: vzeroupper
2042 ; AVX512F-NEXT: retq
2044 ; AVX512BW-LABEL: trunc_mul_v16i16_v16i8:
2045 ; AVX512BW: # %bb.0:
2046 ; AVX512BW-NEXT: vpmullw %ymm1, %ymm0, %ymm0
2047 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
2048 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2049 ; AVX512BW-NEXT: vzeroupper
2050 ; AVX512BW-NEXT: retq
2052 ; AVX512DQ-LABEL: trunc_mul_v16i16_v16i8:
2053 ; AVX512DQ: # %bb.0:
2054 ; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0
2055 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
2056 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
2057 ; AVX512DQ-NEXT: vzeroupper
2058 ; AVX512DQ-NEXT: retq
2059 %1 = mul <16 x i16> %a0, %a1
2060 %2 = trunc <16 x i16> %1 to <16 x i8>
; Takes the low 8 bytes of %a0 (via shufflevector), zero-extends them to
; <8 x i32>, multiplies by %a1 and truncates the product to <8 x i16>.
; In every check group below %a1 is narrowed to 16-bit lanes, %a0 is
; zero-extended only from bytes to words (punpcklbw with zero / vpmovzxbw),
; and a single 128-bit pmullw/vpmullw performs the multiply.
; NOTE(review): unlike the neighbouring tests this function is not marked
; nounwind - confirm whether that is intentional.
2064 define <8 x i16> @trunc_mul_v8i32_v8i16_zext_8i8(<16 x i8> %a0, <8 x i32> %a1) {
2065 ; SSE-LABEL: trunc_mul_v8i32_v8i16_zext_8i8:
2067 ; SSE-NEXT: pxor %xmm3, %xmm3
2068 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
2069 ; SSE-NEXT: pslld $16, %xmm2
2070 ; SSE-NEXT: psrad $16, %xmm2
2071 ; SSE-NEXT: pslld $16, %xmm1
2072 ; SSE-NEXT: psrad $16, %xmm1
2073 ; SSE-NEXT: packssdw %xmm2, %xmm1
2074 ; SSE-NEXT: pmullw %xmm1, %xmm0
2077 ; AVX1-LABEL: trunc_mul_v8i32_v8i16_zext_8i8:
2079 ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
2080 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
2081 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
2082 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2083 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
2084 ; AVX1-NEXT: vzeroupper
2087 ; AVX2-LABEL: trunc_mul_v8i32_v8i16_zext_8i8:
2089 ; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2090 ; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
2091 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
2092 ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
2093 ; AVX2-NEXT: vzeroupper
2096 ; AVX512-LABEL: trunc_mul_v8i32_v8i16_zext_8i8:
2098 ; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
2099 ; AVX512-NEXT: vpmovdw %zmm1, %ymm1
2100 ; AVX512-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2101 ; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0
2102 ; AVX512-NEXT: vzeroupper
2104 %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2105 %2 = zext <8 x i8> %1 to <8 x i32>
2106 %3 = mul <8 x i32> %2, %a1
2107 %4 = trunc <8 x i32> %3 to <8 x i16>
; Multiplies a <4 x i64> vector by the constant <0, 1, 2, 3> and truncates the
; product to <4 x i32>.
; In the AVX1, AVX2-* and AVX512 checks the input is narrowed to 32-bit lanes
; first and then multiplied by a constant-pool operand with a 128-bit vpmulld.
; The three AVX2 variants are checked separately because only the
; fast-crosslane configuration narrows with vpermd; the other two use
; vextractf128 + vshufps.
; The SSE checks multiply only xmm1 (the upper two elements) with pmuludq and
; build the lower half from xorps/unpckhpd - presumably because those elements
; are multiplied by 0 and 1; confirm against the lowering code.
2115 define <4 x i32> @trunc_mul_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
2116 ; SSE-LABEL: trunc_mul_const_v4i64_v4i32:
2118 ; SSE-NEXT: xorps %xmm2, %xmm2
2119 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
2120 ; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
2121 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2]
2122 ; SSE-NEXT: movaps %xmm2, %xmm0
2125 ; AVX1-LABEL: trunc_mul_const_v4i64_v4i32:
2127 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
2128 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2129 ; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2130 ; AVX1-NEXT: vzeroupper
2133 ; AVX2-SLOW-LABEL: trunc_mul_const_v4i64_v4i32:
2134 ; AVX2-SLOW: # %bb.0:
2135 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
2136 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2137 ; AVX2-SLOW-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2138 ; AVX2-SLOW-NEXT: vzeroupper
2139 ; AVX2-SLOW-NEXT: retq
2141 ; AVX2-FAST-ALL-LABEL: trunc_mul_const_v4i64_v4i32:
2142 ; AVX2-FAST-ALL: # %bb.0:
2143 ; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6]
2144 ; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1]
2145 ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0
2146 ; AVX2-FAST-ALL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2147 ; AVX2-FAST-ALL-NEXT: vzeroupper
2148 ; AVX2-FAST-ALL-NEXT: retq
2150 ; AVX2-FAST-PERLANE-LABEL: trunc_mul_const_v4i64_v4i32:
2151 ; AVX2-FAST-PERLANE: # %bb.0:
2152 ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1
2153 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2154 ; AVX2-FAST-PERLANE-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2155 ; AVX2-FAST-PERLANE-NEXT: vzeroupper
2156 ; AVX2-FAST-PERLANE-NEXT: retq
2158 ; AVX512-LABEL: trunc_mul_const_v4i64_v4i32:
2160 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
2161 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
2162 ; AVX512-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2163 ; AVX512-NEXT: vzeroupper
2165 %1 = mul <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
2166 %2 = trunc <4 x i64> %1 to <4 x i32>
; Multiplies an <8 x i64> vector by the constant <0, 1, ..., 7> and truncates
; the product to <8 x i16>.
; In every check group below the input is narrowed to 16-bit lanes first and
; then multiplied by a constant-pool operand with one 128-bit pmullw/vpmullw.
2170 define <8 x i16> @trunc_mul_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
2171 ; SSE-LABEL: trunc_mul_const_v8i64_v8i16:
2173 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
2174 ; SSE-NEXT: pslld $16, %xmm2
2175 ; SSE-NEXT: psrad $16, %xmm2
2176 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2177 ; SSE-NEXT: pslld $16, %xmm0
2178 ; SSE-NEXT: psrad $16, %xmm0
2179 ; SSE-NEXT: packssdw %xmm2, %xmm0
2180 ; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2183 ; AVX1-LABEL: trunc_mul_const_v8i64_v8i16:
2185 ; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,65535,65535,65535]
2186 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
2187 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
2188 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
2189 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
2190 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
2191 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
2192 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
2193 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2194 ; AVX1-NEXT: vzeroupper
2197 ; AVX2-LABEL: trunc_mul_const_v8i64_v8i16:
2199 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
2200 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
2201 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
2202 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
2203 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
2204 ; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
2205 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
2206 ; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2207 ; AVX2-NEXT: vzeroupper
2210 ; AVX512-LABEL: trunc_mul_const_v8i64_v8i16:
2212 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0
2213 ; AVX512-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2214 ; AVX512-NEXT: vzeroupper
2216 %1 = mul <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
2217 %2 = trunc <8 x i64> %1 to <8 x i16>
; Multiplies an <8 x i32> vector by the constant <0, 1, ..., 7> and truncates
; the product to <8 x i16>.
; In every check group below the input is narrowed to 16-bit lanes first and
; then multiplied by a constant-pool operand with one 128-bit pmullw/vpmullw.
2221 define <8 x i16> @trunc_mul_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
2222 ; SSE-LABEL: trunc_mul_const_v8i32_v8i16:
2224 ; SSE-NEXT: pslld $16, %xmm1
2225 ; SSE-NEXT: psrad $16, %xmm1
2226 ; SSE-NEXT: pslld $16, %xmm0
2227 ; SSE-NEXT: psrad $16, %xmm0
2228 ; SSE-NEXT: packssdw %xmm1, %xmm0
2229 ; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2232 ; AVX1-LABEL: trunc_mul_const_v8i32_v8i16:
2234 ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2235 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
2236 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
2237 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2238 ; AVX1-NEXT: vzeroupper
2241 ; AVX2-LABEL: trunc_mul_const_v8i32_v8i16:
2243 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
2244 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
2245 ; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2246 ; AVX2-NEXT: vzeroupper
2249 ; AVX512-LABEL: trunc_mul_const_v8i32_v8i16:
2251 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
2252 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
2253 ; AVX512-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2254 ; AVX512-NEXT: vzeroupper
2256 %1 = mul <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2257 %2 = trunc <8 x i32> %1 to <8 x i16>
; Multiplies a <16 x i64> vector by the constant <0, 1, ..., 15> and truncates
; the product to <16 x i8>.
; Here the multiply is performed at 64-bit width before narrowing: the SSE,
; AVX1, AVX2, AVX512F and AVX512BW checks use pmuludq/vpmuludq with
; constant-pool operands and the AVX512DQ checks use vpmullq.
; In the SSE checks xmm0 is never multiplied; it is only ANDed with a
; constant-pool operand, whereas xmm1-xmm7 are multiplied and then masked with
; the 255 constant held in xmm8.
2261 define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
2262 ; SSE-LABEL: trunc_mul_const_v16i64_v16i8:
2264 ; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
2265 ; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
2266 ; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
2267 ; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
2268 ; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5
2269 ; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
2270 ; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7
2271 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
2272 ; SSE-NEXT: pand %xmm8, %xmm7
2273 ; SSE-NEXT: pand %xmm8, %xmm6
2274 ; SSE-NEXT: packuswb %xmm7, %xmm6
2275 ; SSE-NEXT: pand %xmm8, %xmm5
2276 ; SSE-NEXT: pand %xmm8, %xmm4
2277 ; SSE-NEXT: packuswb %xmm5, %xmm4
2278 ; SSE-NEXT: packuswb %xmm6, %xmm4
2279 ; SSE-NEXT: pand %xmm8, %xmm3
2280 ; SSE-NEXT: pand %xmm8, %xmm2
2281 ; SSE-NEXT: packuswb %xmm3, %xmm2
2282 ; SSE-NEXT: pand %xmm8, %xmm1
2283 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2284 ; SSE-NEXT: packuswb %xmm1, %xmm0
2285 ; SSE-NEXT: packuswb %xmm2, %xmm0
2286 ; SSE-NEXT: packuswb %xmm4, %xmm0
2289 ; AVX1-LABEL: trunc_mul_const_v16i64_v16i8:
2291 ; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm4
2292 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2293 ; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2294 ; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm5
2295 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
2296 ; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
2297 ; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm6
2298 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
2299 ; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
2300 ; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm7
2301 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
2302 ; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
2303 ; AVX1-NEXT: vmovddup {{.*#+}} xmm8 = [255,255]
2304 ; AVX1-NEXT: # xmm8 = mem[0,0]
2305 ; AVX1-NEXT: vpand %xmm3, %xmm8, %xmm3
2306 ; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm7
2307 ; AVX1-NEXT: vpackusdw %xmm3, %xmm7, %xmm3
2308 ; AVX1-NEXT: vpand %xmm2, %xmm8, %xmm2
2309 ; AVX1-NEXT: vpand %xmm6, %xmm8, %xmm6
2310 ; AVX1-NEXT: vpackusdw %xmm2, %xmm6, %xmm2
2311 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
2312 ; AVX1-NEXT: vpand %xmm1, %xmm8, %xmm1
2313 ; AVX1-NEXT: vpand %xmm5, %xmm8, %xmm3
2314 ; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1
2315 ; AVX1-NEXT: vpand %xmm0, %xmm8, %xmm0
2316 ; AVX1-NEXT: vpand %xmm4, %xmm8, %xmm3
2317 ; AVX1-NEXT: vpackusdw %xmm0, %xmm3, %xmm0
2318 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
2319 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
2320 ; AVX1-NEXT: vzeroupper
2323 ; AVX2-LABEL: trunc_mul_const_v16i64_v16i8:
2325 ; AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2326 ; AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
2327 ; AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
2328 ; AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
2329 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
2330 ; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
2331 ; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
2332 ; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2
2333 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
2334 ; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
2335 ; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
2336 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
2337 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
2338 ; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0
2339 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
2340 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
2341 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
2342 ; AVX2-NEXT: vzeroupper
2345 ; AVX512F-LABEL: trunc_mul_const_v16i64_v16i8:
2347 ; AVX512F-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
2348 ; AVX512F-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
2349 ; AVX512F-NEXT: vpmovqb %zmm1, %xmm1
2350 ; AVX512F-NEXT: vpmovqb %zmm0, %xmm0
2351 ; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2352 ; AVX512F-NEXT: vzeroupper
2353 ; AVX512F-NEXT: retq
2355 ; AVX512BW-LABEL: trunc_mul_const_v16i64_v16i8:
2356 ; AVX512BW: # %bb.0:
2357 ; AVX512BW-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
2358 ; AVX512BW-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
2359 ; AVX512BW-NEXT: vpmovqb %zmm1, %xmm1
2360 ; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0
2361 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2362 ; AVX512BW-NEXT: vzeroupper
2363 ; AVX512BW-NEXT: retq
2365 ; AVX512DQ-LABEL: trunc_mul_const_v16i64_v16i8:
2366 ; AVX512DQ: # %bb.0:
2367 ; AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
2368 ; AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
2369 ; AVX512DQ-NEXT: vpmovqb %zmm1, %xmm1
2370 ; AVX512DQ-NEXT: vpmovqb %zmm0, %xmm0
2371 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2372 ; AVX512DQ-NEXT: vzeroupper
2373 ; AVX512DQ-NEXT: retq
2374 %1 = mul <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
2375 %2 = trunc <16 x i64> %1 to <16 x i8>
; Multiplies a <16 x i32> vector by the constant <0, 1, ..., 15> and truncates
; the product to <16 x i8>.
; Every check group multiplies at 32-bit width against constant-pool operands
; before narrowing. The SSE checks assemble the multiply from pmuludq plus
; pshufd/punpckldq, AVX1 uses four 128-bit vpmulld and AVX2 two 256-bit
; vpmulld, each followed by a 255 mask and pack instructions. The shared
; AVX512 checks use one 512-bit vpmulld followed by vpmovdb.
2379 define <16 x i8> @trunc_mul_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
2380 ; SSE-LABEL: trunc_mul_const_v16i32_v16i8:
2382 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
2383 ; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2384 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2385 ; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
2386 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
2387 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
2388 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
2389 ; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
2390 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
2391 ; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
2392 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
2393 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
2394 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
2395 ; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
2396 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
2397 ; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
2398 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
2399 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
2400 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
2401 ; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
2402 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
2403 ; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
2404 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
2405 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
2406 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
2407 ; SSE-NEXT: pand %xmm4, %xmm3
2408 ; SSE-NEXT: pand %xmm4, %xmm2
2409 ; SSE-NEXT: packuswb %xmm3, %xmm2
2410 ; SSE-NEXT: pand %xmm4, %xmm1
2411 ; SSE-NEXT: pand %xmm4, %xmm0
2412 ; SSE-NEXT: packuswb %xmm1, %xmm0
2413 ; SSE-NEXT: packuswb %xmm2, %xmm0
2416 ; AVX1-LABEL: trunc_mul_const_v16i32_v16i8:
2418 ; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
2419 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2420 ; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2421 ; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm3
2422 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
2423 ; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
2424 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [255,255,255,255]
2425 ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
2426 ; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
2427 ; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1
2428 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
2429 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
2430 ; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
2431 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
2432 ; AVX1-NEXT: vzeroupper
2435 ; AVX2-LABEL: trunc_mul_const_v16i32_v16i8:
2437 ; AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2438 ; AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
2439 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
2440 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
2441 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
2442 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
2443 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
2444 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
2445 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
2446 ; AVX2-NEXT: vzeroupper
2449 ; AVX512-LABEL: trunc_mul_const_v16i32_v16i8:
2451 ; AVX512-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
2452 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
2453 ; AVX512-NEXT: vzeroupper
2455 %1 = mul <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
2456 %2 = trunc <16 x i32> %1 to <16 x i8>
; Multiplies the <16 x i16> argument by the constant vector <0, 1, ..., 15>
; and truncates the product to <16 x i8>.
; In each expected sequence below the multiply is done on 16-bit lanes against
; a constant-pool operand first, and the narrowing to bytes follows.
; The assertions are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py, do not hand-edit.
2460 define <16 x i8> @trunc_mul_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
2461 ; SSE-LABEL: trunc_mul_const_v16i16_v16i8:
2463 ; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2464 ; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
2465 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
2466 ; SSE-NEXT: pand %xmm2, %xmm1
2467 ; SSE-NEXT: pand %xmm2, %xmm0
2468 ; SSE-NEXT: packuswb %xmm1, %xmm0
2471 ; AVX1-LABEL: trunc_mul_const_v16i16_v16i8:
2473 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
2474 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2475 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2476 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
2477 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
2478 ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
2479 ; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0
2480 ; AVX1-NEXT: vzeroupper
2483 ; AVX2-LABEL: trunc_mul_const_v16i16_v16i8:
2485 ; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2486 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2487 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
2488 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
2489 ; AVX2-NEXT: vzeroupper
2492 ; AVX512F-LABEL: trunc_mul_const_v16i16_v16i8:
2494 ; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2495 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
2496 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
2497 ; AVX512F-NEXT: vzeroupper
2498 ; AVX512F-NEXT: retq
2500 ; AVX512BW-LABEL: trunc_mul_const_v16i16_v16i8:
2501 ; AVX512BW: # %bb.0:
2502 ; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2503 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
2504 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2505 ; AVX512BW-NEXT: vzeroupper
2506 ; AVX512BW-NEXT: retq
2508 ; AVX512DQ-LABEL: trunc_mul_const_v16i16_v16i8:
2509 ; AVX512DQ: # %bb.0:
2510 ; AVX512DQ-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2511 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
2512 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
2513 ; AVX512DQ-NEXT: vzeroupper
2514 ; AVX512DQ-NEXT: retq
2515 %1 = mul <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
2516 %2 = trunc <16 x i16> %1 to <16 x i8>
; Bitwise-ands two <4 x i64> arguments and truncates the result to <4 x i32>.
; In each expected sequence below the `and` is applied to the full-width data
; before that data is narrowed.
; The assertions are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py, do not hand-edit.
2524 define <4 x i32> @trunc_and_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2525 ; SSE-LABEL: trunc_and_v4i64_v4i32:
2527 ; SSE-NEXT: andps %xmm3, %xmm1
2528 ; SSE-NEXT: andps %xmm2, %xmm0
2529 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2532 ; AVX1-LABEL: trunc_and_v4i64_v4i32:
2534 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
2535 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
2536 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2537 ; AVX1-NEXT: vzeroupper
2540 ; AVX2-SLOW-LABEL: trunc_and_v4i64_v4i32:
2541 ; AVX2-SLOW: # %bb.0:
2542 ; AVX2-SLOW-NEXT: vandps %ymm1, %ymm0, %ymm0
2543 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
2544 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2545 ; AVX2-SLOW-NEXT: vzeroupper
2546 ; AVX2-SLOW-NEXT: retq
2548 ; AVX2-FAST-ALL-LABEL: trunc_and_v4i64_v4i32:
2549 ; AVX2-FAST-ALL: # %bb.0:
2550 ; AVX2-FAST-ALL-NEXT: vandps %ymm1, %ymm0, %ymm0
2551 ; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6]
2552 ; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1]
2553 ; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0
2554 ; AVX2-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2555 ; AVX2-FAST-ALL-NEXT: vzeroupper
2556 ; AVX2-FAST-ALL-NEXT: retq
2558 ; AVX2-FAST-PERLANE-LABEL: trunc_and_v4i64_v4i32:
2559 ; AVX2-FAST-PERLANE: # %bb.0:
2560 ; AVX2-FAST-PERLANE-NEXT: vandps %ymm1, %ymm0, %ymm0
2561 ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1
2562 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2563 ; AVX2-FAST-PERLANE-NEXT: vzeroupper
2564 ; AVX2-FAST-PERLANE-NEXT: retq
2566 ; AVX512-LABEL: trunc_and_v4i64_v4i32:
2568 ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
2569 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
2570 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2571 ; AVX512-NEXT: vzeroupper
2573 %1 = and <4 x i64> %a0, %a1
2574 %2 = trunc <4 x i64> %1 to <4 x i32>
; Bitwise-ands two <8 x i64> arguments and truncates the result to <8 x i16>.
; In each expected sequence below the `and` is applied to the full-width data
; before that data is narrowed.
; The assertions are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py, do not hand-edit.
2578 define <8 x i16> @trunc_and_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
2579 ; SSE-LABEL: trunc_and_v8i64_v8i16:
2581 ; SSE-NEXT: andps %xmm5, %xmm1
2582 ; SSE-NEXT: andps %xmm4, %xmm0
2583 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2584 ; SSE-NEXT: andps %xmm7, %xmm3
2585 ; SSE-NEXT: andps %xmm6, %xmm2
2586 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
2587 ; SSE-NEXT: pslld $16, %xmm2
2588 ; SSE-NEXT: psrad $16, %xmm2
2589 ; SSE-NEXT: pslld $16, %xmm0
2590 ; SSE-NEXT: psrad $16, %xmm0
2591 ; SSE-NEXT: packssdw %xmm2, %xmm0
2594 ; AVX1-LABEL: trunc_and_v8i64_v8i16:
2596 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
2597 ; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
2598 ; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,65535,65535,65535]
2599 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
2600 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
2601 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
2602 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
2603 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
2604 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
2605 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
2606 ; AVX1-NEXT: vzeroupper
2609 ; AVX2-LABEL: trunc_and_v8i64_v8i16:
2611 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
2612 ; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
2613 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
2614 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
2615 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
2616 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
2617 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
2618 ; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
2619 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
2620 ; AVX2-NEXT: vzeroupper
2623 ; AVX512-LABEL: trunc_and_v8i64_v8i16:
2625 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
2626 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0
2627 ; AVX512-NEXT: vzeroupper
2629 %1 = and <8 x i64> %a0, %a1
2630 %2 = trunc <8 x i64> %1 to <8 x i16>
; Bitwise-ands two <8 x i32> arguments and truncates the result to <8 x i16>.
; In each expected sequence below the `and` is applied to the full-width data
; before that data is narrowed.
; The assertions are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py, do not hand-edit.
2634 define <8 x i16> @trunc_and_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
2635 ; SSE-LABEL: trunc_and_v8i32_v8i16:
2637 ; SSE-NEXT: pand %xmm2, %xmm0
2638 ; SSE-NEXT: pand %xmm3, %xmm1
2639 ; SSE-NEXT: pslld $16, %xmm1
2640 ; SSE-NEXT: psrad $16, %xmm1
2641 ; SSE-NEXT: pslld $16, %xmm0
2642 ; SSE-NEXT: psrad $16, %xmm0
2643 ; SSE-NEXT: packssdw %xmm1, %xmm0
2646 ; AVX1-LABEL: trunc_and_v8i32_v8i16:
2648 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
2649 ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2650 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
2651 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
2652 ; AVX1-NEXT: vzeroupper
2655 ; AVX2-LABEL: trunc_and_v8i32_v8i16:
2657 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
2658 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
2659 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
2660 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2661 ; AVX2-NEXT: vzeroupper
2664 ; AVX512-LABEL: trunc_and_v8i32_v8i16:
2666 ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
2667 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
2668 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2669 ; AVX512-NEXT: vzeroupper
2671 %1 = and <8 x i32> %a0, %a1
2672 %2 = trunc <8 x i32> %1 to <8 x i16>
; Bitwise-ands two <16 x i64> arguments and truncates the result to <16 x i8>.
; In each expected sequence below the `and` is applied to the full-width data
; before that data is narrowed. The SSE sequence ands each of xmm0-xmm7 with
; an operand read from the stack (the N(%rsp) pattern).
; The assertions are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py, do not hand-edit.
2676 define <16 x i8> @trunc_and_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
2677 ; SSE-LABEL: trunc_and_v16i64_v16i8:
2679 ; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm0
2680 ; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm1
2681 ; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm2
2682 ; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm3
2683 ; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm4
2684 ; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm5
2685 ; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm6
2686 ; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm7
2687 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
2688 ; SSE-NEXT: pand %xmm8, %xmm7
2689 ; SSE-NEXT: pand %xmm8, %xmm6
2690 ; SSE-NEXT: packuswb %xmm7, %xmm6
2691 ; SSE-NEXT: pand %xmm8, %xmm5
2692 ; SSE-NEXT: pand %xmm8, %xmm4
2693 ; SSE-NEXT: packuswb %xmm5, %xmm4
2694 ; SSE-NEXT: packuswb %xmm6, %xmm4
2695 ; SSE-NEXT: pand %xmm8, %xmm3
2696 ; SSE-NEXT: pand %xmm8, %xmm2
2697 ; SSE-NEXT: packuswb %xmm3, %xmm2
2698 ; SSE-NEXT: pand %xmm8, %xmm1
2699 ; SSE-NEXT: pand %xmm8, %xmm0
2700 ; SSE-NEXT: packuswb %xmm1, %xmm0
2701 ; SSE-NEXT: packuswb %xmm2, %xmm0
2702 ; SSE-NEXT: packuswb %xmm4, %xmm0
2705 ; AVX1-LABEL: trunc_and_v16i64_v16i8:
2707 ; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
2708 ; AVX1-NEXT: vandps %ymm5, %ymm1, %ymm1
2709 ; AVX1-NEXT: vandps %ymm6, %ymm2, %ymm2
2710 ; AVX1-NEXT: vandps %ymm7, %ymm3, %ymm3
2711 ; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [255,255,255,255]
2712 ; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
2713 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
2714 ; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
2715 ; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
2716 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
2717 ; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2
2718 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
2719 ; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
2720 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
2721 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
2722 ; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
2723 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
2724 ; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
2725 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
2726 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
2727 ; AVX1-NEXT: vzeroupper
2730 ; AVX2-LABEL: trunc_and_v16i64_v16i8:
2732 ; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
2733 ; AVX2-NEXT: vpand %ymm5, %ymm1, %ymm1
2734 ; AVX2-NEXT: vpand %ymm6, %ymm2, %ymm2
2735 ; AVX2-NEXT: vpand %ymm7, %ymm3, %ymm3
2736 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
2737 ; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
2738 ; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
2739 ; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2
2740 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
2741 ; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
2742 ; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
2743 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
2744 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
2745 ; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0
2746 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
2747 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
2748 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
2749 ; AVX2-NEXT: vzeroupper
2752 ; AVX512-LABEL: trunc_and_v16i64_v16i8:
2754 ; AVX512-NEXT: vpandq %zmm2, %zmm0, %zmm0
2755 ; AVX512-NEXT: vpandq %zmm3, %zmm1, %zmm1
2756 ; AVX512-NEXT: vpmovqb %zmm1, %xmm1
2757 ; AVX512-NEXT: vpmovqb %zmm0, %xmm0
2758 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2759 ; AVX512-NEXT: vzeroupper
2761 %1 = and <16 x i64> %a0, %a1
2762 %2 = trunc <16 x i64> %1 to <16 x i8>
; Bitwise-ands two <16 x i32> arguments and truncates the result to <16 x i8>.
; In each expected sequence below the `and` is applied to the full-width data
; before that data is narrowed.
; The assertions are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py, do not hand-edit.
2766 define <16 x i8> @trunc_and_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
2767 ; SSE-LABEL: trunc_and_v16i32_v16i8:
2769 ; SSE-NEXT: pand %xmm4, %xmm0
2770 ; SSE-NEXT: pand %xmm5, %xmm1
2771 ; SSE-NEXT: pand %xmm6, %xmm2
2772 ; SSE-NEXT: pand %xmm7, %xmm3
2773 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
2774 ; SSE-NEXT: pand %xmm4, %xmm3
2775 ; SSE-NEXT: pand %xmm4, %xmm2
2776 ; SSE-NEXT: packuswb %xmm3, %xmm2
2777 ; SSE-NEXT: pand %xmm4, %xmm1
2778 ; SSE-NEXT: pand %xmm4, %xmm0
2779 ; SSE-NEXT: packuswb %xmm1, %xmm0
2780 ; SSE-NEXT: packuswb %xmm2, %xmm0
2783 ; AVX1-LABEL: trunc_and_v16i32_v16i8:
2785 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
2786 ; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
2787 ; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
2788 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
2789 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
2790 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
2791 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
2792 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
2793 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
2794 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
2795 ; AVX1-NEXT: vzeroupper
2798 ; AVX2-LABEL: trunc_and_v16i32_v16i8:
2800 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
2801 ; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
2802 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
2803 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
2804 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
2805 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
2806 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
2807 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
2808 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
2809 ; AVX2-NEXT: vzeroupper
2812 ; AVX512-LABEL: trunc_and_v16i32_v16i8:
2814 ; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0
2815 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
2816 ; AVX512-NEXT: vzeroupper
2818 %1 = and <16 x i32> %a0, %a1
2819 %2 = trunc <16 x i32> %1 to <16 x i8>
; Bitwise-ands two <16 x i16> arguments and truncates the result to <16 x i8>.
; In each expected sequence below the `and` is applied to the full-width data
; before that data is narrowed.
; The assertions are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py, do not hand-edit.
2823 define <16 x i8> @trunc_and_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
2824 ; SSE-LABEL: trunc_and_v16i16_v16i8:
2826 ; SSE-NEXT: pand %xmm2, %xmm0
2827 ; SSE-NEXT: pand %xmm3, %xmm1
2828 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
2829 ; SSE-NEXT: pand %xmm2, %xmm1
2830 ; SSE-NEXT: pand %xmm2, %xmm0
2831 ; SSE-NEXT: packuswb %xmm1, %xmm0
2834 ; AVX1-LABEL: trunc_and_v16i16_v16i8:
2836 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
2837 ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2838 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
2839 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
2840 ; AVX1-NEXT: vzeroupper
2843 ; AVX2-LABEL: trunc_and_v16i16_v16i8:
2845 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
2846 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2847 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
2848 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
2849 ; AVX2-NEXT: vzeroupper
2852 ; AVX512F-LABEL: trunc_and_v16i16_v16i8:
2854 ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
2855 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
2856 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
2857 ; AVX512F-NEXT: vzeroupper
2858 ; AVX512F-NEXT: retq
2860 ; AVX512BW-LABEL: trunc_and_v16i16_v16i8:
2861 ; AVX512BW: # %bb.0:
2862 ; AVX512BW-NEXT: vpand %ymm1, %ymm0, %ymm0
2863 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
2864 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2865 ; AVX512BW-NEXT: vzeroupper
2866 ; AVX512BW-NEXT: retq
2868 ; AVX512DQ-LABEL: trunc_and_v16i16_v16i8:
2869 ; AVX512DQ: # %bb.0:
2870 ; AVX512DQ-NEXT: vpand %ymm1, %ymm0, %ymm0
2871 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
2872 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
2873 ; AVX512DQ-NEXT: vzeroupper
2874 ; AVX512DQ-NEXT: retq
2875 %1 = and <16 x i16> %a0, %a1
2876 %2 = trunc <16 x i16> %1 to <16 x i8>
; Bitwise-ands the <4 x i64> argument with the constant vector <0, 1, 2, 3>
; and truncates the result to <4 x i32>.
; In each expected sequence below the narrowing comes first and the last
; arithmetic instruction is an `and` against a constant-pool operand (the LCPI
; pattern), i.e. the constant is applied at the narrow width.
; The assertions are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py, do not hand-edit.
2884 define <4 x i32> @trunc_and_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
2885 ; SSE-LABEL: trunc_and_const_v4i64_v4i32:
2887 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2888 ; SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2891 ; AVX1-LABEL: trunc_and_const_v4i64_v4i32:
2893 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
2894 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2895 ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2896 ; AVX1-NEXT: vzeroupper
2899 ; AVX2-SLOW-LABEL: trunc_and_const_v4i64_v4i32:
2900 ; AVX2-SLOW: # %bb.0:
2901 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
2902 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2903 ; AVX2-SLOW-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2904 ; AVX2-SLOW-NEXT: vzeroupper
2905 ; AVX2-SLOW-NEXT: retq
2907 ; AVX2-FAST-ALL-LABEL: trunc_and_const_v4i64_v4i32:
2908 ; AVX2-FAST-ALL: # %bb.0:
2909 ; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6]
2910 ; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1]
2911 ; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0
2912 ; AVX2-FAST-ALL-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2913 ; AVX2-FAST-ALL-NEXT: vzeroupper
2914 ; AVX2-FAST-ALL-NEXT: retq
2916 ; AVX2-FAST-PERLANE-LABEL: trunc_and_const_v4i64_v4i32:
2917 ; AVX2-FAST-PERLANE: # %bb.0:
2918 ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1
2919 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2920 ; AVX2-FAST-PERLANE-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2921 ; AVX2-FAST-PERLANE-NEXT: vzeroupper
2922 ; AVX2-FAST-PERLANE-NEXT: retq
2924 ; AVX512-LABEL: trunc_and_const_v4i64_v4i32:
2926 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
2927 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
2928 ; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2929 ; AVX512-NEXT: vzeroupper
2931 %1 = and <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
2932 %2 = trunc <4 x i64> %1 to <4 x i32>
; Bitwise-ands the <8 x i64> argument with the constant vector <0, 1, ..., 7>
; and truncates the result to <8 x i16>.
; In each expected sequence below the narrowing comes first and the last
; arithmetic instruction is an `and` against a constant-pool operand (the LCPI
; pattern), i.e. the constant is applied at the narrow width.
; The assertions are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py, do not hand-edit.
2936 define <8 x i16> @trunc_and_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
2937 ; SSE-LABEL: trunc_and_const_v8i64_v8i16:
2939 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
2940 ; SSE-NEXT: pslld $16, %xmm2
2941 ; SSE-NEXT: psrad $16, %xmm2
2942 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2943 ; SSE-NEXT: pslld $16, %xmm0
2944 ; SSE-NEXT: psrad $16, %xmm0
2945 ; SSE-NEXT: packssdw %xmm2, %xmm0
2946 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2949 ; AVX1-LABEL: trunc_and_const_v8i64_v8i16:
2951 ; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,65535,65535,65535]
2952 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
2953 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
2954 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
2955 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
2956 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
2957 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
2958 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
2959 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2960 ; AVX1-NEXT: vzeroupper
2963 ; AVX2-LABEL: trunc_and_const_v8i64_v8i16:
2965 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
2966 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
2967 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
2968 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
2969 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
2970 ; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
2971 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
2972 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2973 ; AVX2-NEXT: vzeroupper
2976 ; AVX512-LABEL: trunc_and_const_v8i64_v8i16:
2978 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0
2979 ; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2980 ; AVX512-NEXT: vzeroupper
2982 %1 = and <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
2983 %2 = trunc <8 x i64> %1 to <8 x i16>
; Bitwise-ands the <8 x i32> argument with the constant vector <0, 1, ..., 7>
; and truncates the result to <8 x i16>.
; In each expected sequence below the narrowing comes first and the last
; arithmetic instruction is an `and` against a constant-pool operand (the LCPI
; pattern), i.e. the constant is applied at the narrow width.
; The assertions are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py, do not hand-edit.
2987 define <8 x i16> @trunc_and_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
2988 ; SSE-LABEL: trunc_and_const_v8i32_v8i16:
2990 ; SSE-NEXT: pslld $16, %xmm1
2991 ; SSE-NEXT: psrad $16, %xmm1
2992 ; SSE-NEXT: pslld $16, %xmm0
2993 ; SSE-NEXT: psrad $16, %xmm0
2994 ; SSE-NEXT: packssdw %xmm1, %xmm0
2995 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2998 ; AVX1-LABEL: trunc_and_const_v8i32_v8i16:
3000 ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
3001 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
3002 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
3003 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3004 ; AVX1-NEXT: vzeroupper
3007 ; AVX2-LABEL: trunc_and_const_v8i32_v8i16:
3009 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
3010 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3011 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3012 ; AVX2-NEXT: vzeroupper
3015 ; AVX512-LABEL: trunc_and_const_v8i32_v8i16:
3017 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
3018 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
3019 ; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3020 ; AVX512-NEXT: vzeroupper
3022 %1 = and <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
3023 %2 = trunc <8 x i32> %1 to <8 x i16>
; Bitwise-ands the <16 x i64> argument with the constant vector <0, 1, ..., 15>
; and truncates the result to <16 x i8>.
; In each expected sequence below the narrowing comes first and the last
; arithmetic instruction is an `and` against a constant-pool operand (the LCPI
; pattern), i.e. the constant is applied at the narrow width.
; The assertions are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py, do not hand-edit.
3027 define <16 x i8> @trunc_and_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
3028 ; SSE-LABEL: trunc_and_const_v16i64_v16i8:
3030 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
3031 ; SSE-NEXT: pand %xmm8, %xmm7
3032 ; SSE-NEXT: pand %xmm8, %xmm6
3033 ; SSE-NEXT: packuswb %xmm7, %xmm6
3034 ; SSE-NEXT: pand %xmm8, %xmm5
3035 ; SSE-NEXT: pand %xmm8, %xmm4
3036 ; SSE-NEXT: packuswb %xmm5, %xmm4
3037 ; SSE-NEXT: packuswb %xmm6, %xmm4
3038 ; SSE-NEXT: pand %xmm8, %xmm3
3039 ; SSE-NEXT: pand %xmm8, %xmm2
3040 ; SSE-NEXT: packuswb %xmm3, %xmm2
3041 ; SSE-NEXT: pand %xmm8, %xmm1
3042 ; SSE-NEXT: pand %xmm8, %xmm0
3043 ; SSE-NEXT: packuswb %xmm1, %xmm0
3044 ; SSE-NEXT: packuswb %xmm2, %xmm0
3045 ; SSE-NEXT: packuswb %xmm4, %xmm0
3046 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
3049 ; AVX1-LABEL: trunc_and_const_v16i64_v16i8:
3051 ; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [255,255,255,255]
3052 ; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
3053 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
3054 ; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
3055 ; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
3056 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
3057 ; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2
3058 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
3059 ; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
3060 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
3061 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
3062 ; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
3063 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
3064 ; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
3065 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
3066 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
3067 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3068 ; AVX1-NEXT: vzeroupper
3071 ; AVX2-LABEL: trunc_and_const_v16i64_v16i8:
3073 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
3074 ; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
3075 ; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
3076 ; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2
3077 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
3078 ; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
3079 ; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
3080 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
3081 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
3082 ; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0
3083 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
3084 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
3085 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
3086 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3087 ; AVX2-NEXT: vzeroupper
3090 ; AVX512-LABEL: trunc_and_const_v16i64_v16i8:
3092 ; AVX512-NEXT: vpmovqb %zmm1, %xmm1
3093 ; AVX512-NEXT: vpmovqb %zmm0, %xmm0
3094 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3095 ; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3096 ; AVX512-NEXT: vzeroupper
3098 %1 = and <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
3099 %2 = trunc <16 x i64> %1 to <16 x i8>
; Bitwise-ands the <16 x i32> argument with the constant vector <0, 1, ..., 15>
; and truncates the result to <16 x i8>.
; In each expected sequence below the narrowing comes first and the last
; arithmetic instruction is an `and` against a constant-pool operand (the LCPI
; pattern), i.e. the constant is applied at the narrow width.
; The assertions are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py, do not hand-edit.
3103 define <16 x i8> @trunc_and_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
3104 ; SSE-LABEL: trunc_and_const_v16i32_v16i8:
3106 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
3107 ; SSE-NEXT: pand %xmm4, %xmm3
3108 ; SSE-NEXT: pand %xmm4, %xmm2
3109 ; SSE-NEXT: packuswb %xmm3, %xmm2
3110 ; SSE-NEXT: pand %xmm4, %xmm1
3111 ; SSE-NEXT: pand %xmm4, %xmm0
3112 ; SSE-NEXT: packuswb %xmm1, %xmm0
3113 ; SSE-NEXT: packuswb %xmm2, %xmm0
3114 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
3117 ; AVX1-LABEL: trunc_and_const_v16i32_v16i8:
3119 ; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
3120 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
3121 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
3122 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
3123 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
3124 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
3125 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
3126 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
3127 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3128 ; AVX1-NEXT: vzeroupper
3131 ; AVX2-LABEL: trunc_and_const_v16i32_v16i8:
3133 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
3134 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
3135 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
3136 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
3137 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
3138 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
3139 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
3140 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3141 ; AVX2-NEXT: vzeroupper
3144 ; AVX512-LABEL: trunc_and_const_v16i32_v16i8:
3146 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
3147 ; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3148 ; AVX512-NEXT: vzeroupper
3150 %1 = and <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
3151 %2 = trunc <16 x i32> %1 to <16 x i8>
; Bitwise-ands the <16 x i16> argument with the constant vector <0, 1, ..., 15>
; and truncates the result to <16 x i8>.
; In each expected sequence below the narrowing comes first and the last
; arithmetic instruction is an `and` against a constant-pool operand (the LCPI
; pattern), i.e. the constant is applied at the narrow width.
; The assertions are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py, do not hand-edit.
3155 define <16 x i8> @trunc_and_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
3156 ; SSE-LABEL: trunc_and_const_v16i16_v16i8:
3158 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
3159 ; SSE-NEXT: pand %xmm2, %xmm1
3160 ; SSE-NEXT: pand %xmm2, %xmm0
3161 ; SSE-NEXT: packuswb %xmm1, %xmm0
3162 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
3165 ; AVX1-LABEL: trunc_and_const_v16i16_v16i8:
3167 ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
3168 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
3169 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
3170 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3171 ; AVX1-NEXT: vzeroupper
3174 ; AVX2-LABEL: trunc_and_const_v16i16_v16i8:
3176 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
3177 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
3178 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
3179 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3180 ; AVX2-NEXT: vzeroupper
3183 ; AVX512F-LABEL: trunc_and_const_v16i16_v16i8:
3185 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
3186 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
3187 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3188 ; AVX512F-NEXT: vzeroupper
3189 ; AVX512F-NEXT: retq
3191 ; AVX512BW-LABEL: trunc_and_const_v16i16_v16i8:
3192 ; AVX512BW: # %bb.0:
3193 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
3194 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
3195 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3196 ; AVX512BW-NEXT: vzeroupper
3197 ; AVX512BW-NEXT: retq
3199 ; AVX512DQ-LABEL: trunc_and_const_v16i16_v16i8:
3200 ; AVX512DQ: # %bb.0:
3201 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
3202 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
3203 ; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3204 ; AVX512DQ-NEXT: vzeroupper
3205 ; AVX512DQ-NEXT: retq
3206 %1 = and <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
3207 %2 = trunc <16 x i16> %1 to <16 x i8>
; Bitwise-xors two <4 x i64> arguments and truncates the result to <4 x i32>.
; In each expected sequence below the `xor` is applied to the full-width data
; before that data is narrowed.
; The assertions are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py, do not hand-edit.
3215 define <4 x i32> @trunc_xor_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
3216 ; SSE-LABEL: trunc_xor_v4i64_v4i32:
3218 ; SSE-NEXT: xorps %xmm3, %xmm1
3219 ; SSE-NEXT: xorps %xmm2, %xmm0
3220 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3223 ; AVX1-LABEL: trunc_xor_v4i64_v4i32:
3225 ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
3226 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
3227 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3228 ; AVX1-NEXT: vzeroupper
3231 ; AVX2-SLOW-LABEL: trunc_xor_v4i64_v4i32:
3232 ; AVX2-SLOW: # %bb.0:
3233 ; AVX2-SLOW-NEXT: vxorps %ymm1, %ymm0, %ymm0
3234 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
3235 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3236 ; AVX2-SLOW-NEXT: vzeroupper
3237 ; AVX2-SLOW-NEXT: retq
3239 ; AVX2-FAST-ALL-LABEL: trunc_xor_v4i64_v4i32:
3240 ; AVX2-FAST-ALL: # %bb.0:
3241 ; AVX2-FAST-ALL-NEXT: vxorps %ymm1, %ymm0, %ymm0
3242 ; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6]
3243 ; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1]
3244 ; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0
3245 ; AVX2-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
3246 ; AVX2-FAST-ALL-NEXT: vzeroupper
3247 ; AVX2-FAST-ALL-NEXT: retq
3249 ; AVX2-FAST-PERLANE-LABEL: trunc_xor_v4i64_v4i32:
3250 ; AVX2-FAST-PERLANE: # %bb.0:
3251 ; AVX2-FAST-PERLANE-NEXT: vxorps %ymm1, %ymm0, %ymm0
3252 ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1
3253 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3254 ; AVX2-FAST-PERLANE-NEXT: vzeroupper
3255 ; AVX2-FAST-PERLANE-NEXT: retq
3257 ; AVX512-LABEL: trunc_xor_v4i64_v4i32:
3259 ; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0
3260 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
3261 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
3262 ; AVX512-NEXT: vzeroupper
3264 %1 = xor <4 x i64> %a0, %a1
3265 %2 = trunc <4 x i64> %1 to <4 x i32>
; Bitwise-xors two <8 x i64> arguments and truncates the result to <8 x i16>.
; In each expected sequence below the `xor` is applied to the full-width data
; before that data is narrowed.
; The assertions are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py, do not hand-edit.
3269 define <8 x i16> @trunc_xor_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
3270 ; SSE-LABEL: trunc_xor_v8i64_v8i16:
3272 ; SSE-NEXT: xorps %xmm5, %xmm1
3273 ; SSE-NEXT: xorps %xmm4, %xmm0
3274 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3275 ; SSE-NEXT: xorps %xmm7, %xmm3
3276 ; SSE-NEXT: xorps %xmm6, %xmm2
3277 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
3278 ; SSE-NEXT: pslld $16, %xmm2
3279 ; SSE-NEXT: psrad $16, %xmm2
3280 ; SSE-NEXT: pslld $16, %xmm0
3281 ; SSE-NEXT: psrad $16, %xmm0
3282 ; SSE-NEXT: packssdw %xmm2, %xmm0
3285 ; AVX1-LABEL: trunc_xor_v8i64_v8i16:
3287 ; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0
3288 ; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm1
3289 ; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,65535,65535,65535]
3290 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
3291 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
3292 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
3293 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
3294 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
3295 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
3296 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
3297 ; AVX1-NEXT: vzeroupper
3300 ; AVX2-LABEL: trunc_xor_v8i64_v8i16:
3302 ; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
3303 ; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1
3304 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
3305 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
3306 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
3307 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
3308 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
3309 ; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
3310 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
3311 ; AVX2-NEXT: vzeroupper
3314 ; AVX512-LABEL: trunc_xor_v8i64_v8i16:
3316 ; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
3317 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0
3318 ; AVX512-NEXT: vzeroupper
3320 %1 = xor <8 x i64> %a0, %a1
3321 %2 = trunc <8 x i64> %1 to <8 x i16>
; Bitwise xor of two <8 x i32> vectors followed by a truncate to <8 x i16>.
; Lowering pinned by the autogenerated checks below, per run configuration -
;  * SSE2 targets use two pxor, then pslld/psrad by 16 on each half and one
;    packssdw.
;  * AVX1 targets use a ymm vxorps, a vandps with a constant-pool mask, then
;    vextractf128 + vpackusdw.
;  * AVX2 targets use a ymm vpxor, a vpshufb that gathers bytes 0,1,4,5,8,9,
;    12,13 of each 128-bit lane, and a vpermq.
;  * AVX512 targets use a ymm vpxor followed by vpmovdw on zmm0.
3325 define <8 x i16> @trunc_xor_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
3326 ; SSE-LABEL: trunc_xor_v8i32_v8i16:
3328 ; SSE-NEXT: pxor %xmm2, %xmm0
3329 ; SSE-NEXT: pxor %xmm3, %xmm1
3330 ; SSE-NEXT: pslld $16, %xmm1
3331 ; SSE-NEXT: psrad $16, %xmm1
3332 ; SSE-NEXT: pslld $16, %xmm0
3333 ; SSE-NEXT: psrad $16, %xmm0
3334 ; SSE-NEXT: packssdw %xmm1, %xmm0
3337 ; AVX1-LABEL: trunc_xor_v8i32_v8i16:
3339 ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
3340 ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
3341 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
3342 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
3343 ; AVX1-NEXT: vzeroupper
3346 ; AVX2-LABEL: trunc_xor_v8i32_v8i16:
3348 ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
3349 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
3350 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3351 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
3352 ; AVX2-NEXT: vzeroupper
3355 ; AVX512-LABEL: trunc_xor_v8i32_v8i16:
3357 ; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0
3358 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
3359 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
3360 ; AVX512-NEXT: vzeroupper
3362 %1 = xor <8 x i32> %a0, %a1
3363 %2 = trunc <8 x i32> %1 to <8 x i16>
; Bitwise xor of two <16 x i64> vectors followed by a truncate to <16 x i8>.
; Lowering pinned by the autogenerated checks below, per run configuration -
;  * SSE2 targets xor xmm0-xmm7 with operands read from the stack
;    (pxor N(%rsp)), mask every register with a 255-per-qword constant and
;    combine them through a tree of packuswb.
;  * AVX1 targets use four ymm vxorps, a vbroadcastsd 255 mask, then vpackusdw
;    steps and a final vpackuswb.
;  * AVX2 targets use four ymm vpxor, a vpbroadcastq mask, vpackusdw + vpermq
;    pairs, then vpackuswb and vpshufd.
;  * AVX512 targets use two zmm vpxorq, two vpmovqb, and a vpunpcklqdq that
;    joins the two 8-byte halves.
3367 define <16 x i8> @trunc_xor_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
3368 ; SSE-LABEL: trunc_xor_v16i64_v16i8:
3370 ; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm0
3371 ; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm1
3372 ; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm2
3373 ; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm3
3374 ; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm4
3375 ; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm5
3376 ; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm6
3377 ; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm7
3378 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
3379 ; SSE-NEXT: pand %xmm8, %xmm7
3380 ; SSE-NEXT: pand %xmm8, %xmm6
3381 ; SSE-NEXT: packuswb %xmm7, %xmm6
3382 ; SSE-NEXT: pand %xmm8, %xmm5
3383 ; SSE-NEXT: pand %xmm8, %xmm4
3384 ; SSE-NEXT: packuswb %xmm5, %xmm4
3385 ; SSE-NEXT: packuswb %xmm6, %xmm4
3386 ; SSE-NEXT: pand %xmm8, %xmm3
3387 ; SSE-NEXT: pand %xmm8, %xmm2
3388 ; SSE-NEXT: packuswb %xmm3, %xmm2
3389 ; SSE-NEXT: pand %xmm8, %xmm1
3390 ; SSE-NEXT: pand %xmm8, %xmm0
3391 ; SSE-NEXT: packuswb %xmm1, %xmm0
3392 ; SSE-NEXT: packuswb %xmm2, %xmm0
3393 ; SSE-NEXT: packuswb %xmm4, %xmm0
3396 ; AVX1-LABEL: trunc_xor_v16i64_v16i8:
3398 ; AVX1-NEXT: vxorps %ymm4, %ymm0, %ymm0
3399 ; AVX1-NEXT: vxorps %ymm5, %ymm1, %ymm1
3400 ; AVX1-NEXT: vxorps %ymm6, %ymm2, %ymm2
3401 ; AVX1-NEXT: vxorps %ymm7, %ymm3, %ymm3
3402 ; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [255,255,255,255]
3403 ; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
3404 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
3405 ; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
3406 ; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
3407 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
3408 ; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2
3409 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
3410 ; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
3411 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
3412 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
3413 ; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
3414 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
3415 ; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
3416 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
3417 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
3418 ; AVX1-NEXT: vzeroupper
3421 ; AVX2-LABEL: trunc_xor_v16i64_v16i8:
3423 ; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm0
3424 ; AVX2-NEXT: vpxor %ymm5, %ymm1, %ymm1
3425 ; AVX2-NEXT: vpxor %ymm6, %ymm2, %ymm2
3426 ; AVX2-NEXT: vpxor %ymm7, %ymm3, %ymm3
3427 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
3428 ; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
3429 ; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
3430 ; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2
3431 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
3432 ; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
3433 ; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
3434 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
3435 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
3436 ; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0
3437 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
3438 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
3439 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
3440 ; AVX2-NEXT: vzeroupper
3443 ; AVX512-LABEL: trunc_xor_v16i64_v16i8:
3445 ; AVX512-NEXT: vpxorq %zmm2, %zmm0, %zmm0
3446 ; AVX512-NEXT: vpxorq %zmm3, %zmm1, %zmm1
3447 ; AVX512-NEXT: vpmovqb %zmm1, %xmm1
3448 ; AVX512-NEXT: vpmovqb %zmm0, %xmm0
3449 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3450 ; AVX512-NEXT: vzeroupper
3452 %1 = xor <16 x i64> %a0, %a1
3453 %2 = trunc <16 x i64> %1 to <16 x i8>
; Bitwise xor of two <16 x i32> vectors followed by a truncate to <16 x i8>.
; Lowering pinned by the autogenerated checks below, per run configuration -
;  * SSE2 targets use four pxor, a 255-per-dword mask and three packuswb.
;  * AVX1 targets use two ymm vxorps, a vbroadcastss 255 mask, two vpackusdw
;    and a final vpackuswb.
;  * AVX2 targets use two ymm vpxor, a vpbroadcastd mask, vpackusdw,
;    vextracti128 + vpackuswb, then vpshufd.
;  * AVX512 targets use one zmm vpxord and a single vpmovdb.
3457 define <16 x i8> @trunc_xor_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
3458 ; SSE-LABEL: trunc_xor_v16i32_v16i8:
3460 ; SSE-NEXT: pxor %xmm4, %xmm0
3461 ; SSE-NEXT: pxor %xmm5, %xmm1
3462 ; SSE-NEXT: pxor %xmm6, %xmm2
3463 ; SSE-NEXT: pxor %xmm7, %xmm3
3464 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
3465 ; SSE-NEXT: pand %xmm4, %xmm3
3466 ; SSE-NEXT: pand %xmm4, %xmm2
3467 ; SSE-NEXT: packuswb %xmm3, %xmm2
3468 ; SSE-NEXT: pand %xmm4, %xmm1
3469 ; SSE-NEXT: pand %xmm4, %xmm0
3470 ; SSE-NEXT: packuswb %xmm1, %xmm0
3471 ; SSE-NEXT: packuswb %xmm2, %xmm0
3474 ; AVX1-LABEL: trunc_xor_v16i32_v16i8:
3476 ; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0
3477 ; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm1
3478 ; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
3479 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
3480 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
3481 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
3482 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
3483 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
3484 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
3485 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
3486 ; AVX1-NEXT: vzeroupper
3489 ; AVX2-LABEL: trunc_xor_v16i32_v16i8:
3491 ; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
3492 ; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1
3493 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
3494 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
3495 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
3496 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
3497 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
3498 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
3499 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
3500 ; AVX2-NEXT: vzeroupper
3503 ; AVX512-LABEL: trunc_xor_v16i32_v16i8:
3505 ; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0
3506 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
3507 ; AVX512-NEXT: vzeroupper
3509 %1 = xor <16 x i32> %a0, %a1
3510 %2 = trunc <16 x i32> %1 to <16 x i8>
; Bitwise xor of two <16 x i16> vectors followed by a truncate to <16 x i8>.
; Lowering pinned by the autogenerated checks below, per run configuration -
;  * SSE2 targets use two pxor, a 255-per-word mask and one packuswb.
;  * AVX1 targets use a ymm vxorps, a vandps with a constant-pool mask, then
;    vextractf128 + vpackuswb.
;  * AVX2 targets use a ymm vpxor, a vpand with a constant-pool mask, then
;    vextracti128 + vpackuswb.
;  * The AVX512F and AVX512DQ prefixes expect a ymm vpxor, a vpmovzxwd widening
;    into zmm0 and a vpmovdb.
;  * The AVX512BW prefix expects a ymm vpxor followed by a single vpmovwb.
3514 define <16 x i8> @trunc_xor_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
3515 ; SSE-LABEL: trunc_xor_v16i16_v16i8:
3517 ; SSE-NEXT: pxor %xmm2, %xmm0
3518 ; SSE-NEXT: pxor %xmm3, %xmm1
3519 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
3520 ; SSE-NEXT: pand %xmm2, %xmm1
3521 ; SSE-NEXT: pand %xmm2, %xmm0
3522 ; SSE-NEXT: packuswb %xmm1, %xmm0
3525 ; AVX1-LABEL: trunc_xor_v16i16_v16i8:
3527 ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
3528 ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
3529 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
3530 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
3531 ; AVX1-NEXT: vzeroupper
3534 ; AVX2-LABEL: trunc_xor_v16i16_v16i8:
3536 ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
3537 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
3538 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
3539 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
3540 ; AVX2-NEXT: vzeroupper
3543 ; AVX512F-LABEL: trunc_xor_v16i16_v16i8:
3545 ; AVX512F-NEXT: vpxor %ymm1, %ymm0, %ymm0
3546 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
3547 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
3548 ; AVX512F-NEXT: vzeroupper
3549 ; AVX512F-NEXT: retq
3551 ; AVX512BW-LABEL: trunc_xor_v16i16_v16i8:
3552 ; AVX512BW: # %bb.0:
3553 ; AVX512BW-NEXT: vpxor %ymm1, %ymm0, %ymm0
3554 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
3555 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
3556 ; AVX512BW-NEXT: vzeroupper
3557 ; AVX512BW-NEXT: retq
3559 ; AVX512DQ-LABEL: trunc_xor_v16i16_v16i8:
3560 ; AVX512DQ: # %bb.0:
3561 ; AVX512DQ-NEXT: vpxor %ymm1, %ymm0, %ymm0
3562 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
3563 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
3564 ; AVX512DQ-NEXT: vzeroupper
3565 ; AVX512DQ-NEXT: retq
3566 %1 = xor <16 x i16> %a0, %a1
3567 %2 = trunc <16 x i16> %1 to <16 x i8>
; Xor of a <4 x i64> vector with the constant <0, 1, 2, 3>, followed by a
; truncate to <4 x i32>.
; In every check block below the narrowing sequence comes first and the xor is
; emitted last, as one xmm-wide operation against a constant-pool operand.
;  * SSE2 targets use shufps then xorps.
;  * The AVX1, AVX2-SLOW and AVX2-FAST-PERLANE prefixes expect vextractf128 +
;    vshufps then vxorps.
;  * The AVX2-FAST-ALL prefix expects a vpermps driven by a [0,2,4,6,0,2,4,6]
;    index vector, then vxorps.
;  * AVX512 targets use vpmovqd on zmm0 then vpxor.
3575 define <4 x i32> @trunc_xor_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
3576 ; SSE-LABEL: trunc_xor_const_v4i64_v4i32:
3578 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3579 ; SSE-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
3582 ; AVX1-LABEL: trunc_xor_const_v4i64_v4i32:
3584 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
3585 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3586 ; AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3587 ; AVX1-NEXT: vzeroupper
3590 ; AVX2-SLOW-LABEL: trunc_xor_const_v4i64_v4i32:
3591 ; AVX2-SLOW: # %bb.0:
3592 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
3593 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3594 ; AVX2-SLOW-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3595 ; AVX2-SLOW-NEXT: vzeroupper
3596 ; AVX2-SLOW-NEXT: retq
3598 ; AVX2-FAST-ALL-LABEL: trunc_xor_const_v4i64_v4i32:
3599 ; AVX2-FAST-ALL: # %bb.0:
3600 ; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6]
3601 ; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1]
3602 ; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0
3603 ; AVX2-FAST-ALL-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3604 ; AVX2-FAST-ALL-NEXT: vzeroupper
3605 ; AVX2-FAST-ALL-NEXT: retq
3607 ; AVX2-FAST-PERLANE-LABEL: trunc_xor_const_v4i64_v4i32:
3608 ; AVX2-FAST-PERLANE: # %bb.0:
3609 ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1
3610 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3611 ; AVX2-FAST-PERLANE-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3612 ; AVX2-FAST-PERLANE-NEXT: vzeroupper
3613 ; AVX2-FAST-PERLANE-NEXT: retq
3615 ; AVX512-LABEL: trunc_xor_const_v4i64_v4i32:
3617 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
3618 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
3619 ; AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3620 ; AVX512-NEXT: vzeroupper
3622 %1 = xor <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
3623 %2 = trunc <4 x i64> %1 to <4 x i32>
; Xor of a <8 x i64> vector with the constant <0, 1, ..., 7>, followed by a
; truncate to <8 x i16>.
; In every check block below the narrowing sequence comes first and the xor is
; emitted last, as one xmm-wide pxor/vpxor against a constant-pool operand.
;  * SSE2 targets narrow with shufps, pslld/psrad by 16 and packssdw.
;  * AVX1 targets narrow with a 65535 mask and a chain of vpackusdw.
;  * AVX2 targets narrow with vpblendw against zero, vpackusdw and vpshufd.
;  * AVX512 targets narrow with a single vpmovqw.
3627 define <8 x i16> @trunc_xor_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
3628 ; SSE-LABEL: trunc_xor_const_v8i64_v8i16:
3630 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
3631 ; SSE-NEXT: pslld $16, %xmm2
3632 ; SSE-NEXT: psrad $16, %xmm2
3633 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3634 ; SSE-NEXT: pslld $16, %xmm0
3635 ; SSE-NEXT: psrad $16, %xmm0
3636 ; SSE-NEXT: packssdw %xmm2, %xmm0
3637 ; SSE-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
3640 ; AVX1-LABEL: trunc_xor_const_v8i64_v8i16:
3642 ; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,65535,65535,65535]
3643 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
3644 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
3645 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
3646 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
3647 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
3648 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
3649 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
3650 ; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3651 ; AVX1-NEXT: vzeroupper
3654 ; AVX2-LABEL: trunc_xor_const_v8i64_v8i16:
3656 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
3657 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
3658 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
3659 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
3660 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
3661 ; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
3662 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
3663 ; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3664 ; AVX2-NEXT: vzeroupper
3667 ; AVX512-LABEL: trunc_xor_const_v8i64_v8i16:
3669 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0
3670 ; AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3671 ; AVX512-NEXT: vzeroupper
3673 %1 = xor <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
3674 %2 = trunc <8 x i64> %1 to <8 x i16>
; Xor of a <8 x i32> vector with the constant <0, 1, ..., 7>, followed by a
; truncate to <8 x i16>.
; In every check block below the narrowing sequence comes first and the xor is
; emitted last, as one xmm-wide pxor/vpxor against a constant-pool operand.
;  * SSE2 targets narrow with pslld/psrad by 16 and packssdw.
;  * AVX1 targets narrow with a constant-pool vandps mask, vextractf128 and
;    vpackusdw.
;  * AVX2 targets narrow with vpshufb + vpermq.
;  * AVX512 targets narrow with vpmovdw on zmm0.
3678 define <8 x i16> @trunc_xor_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
3679 ; SSE-LABEL: trunc_xor_const_v8i32_v8i16:
3681 ; SSE-NEXT: pslld $16, %xmm1
3682 ; SSE-NEXT: psrad $16, %xmm1
3683 ; SSE-NEXT: pslld $16, %xmm0
3684 ; SSE-NEXT: psrad $16, %xmm0
3685 ; SSE-NEXT: packssdw %xmm1, %xmm0
3686 ; SSE-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
3689 ; AVX1-LABEL: trunc_xor_const_v8i32_v8i16:
3691 ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
3692 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
3693 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
3694 ; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3695 ; AVX1-NEXT: vzeroupper
3698 ; AVX2-LABEL: trunc_xor_const_v8i32_v8i16:
3700 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
3701 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3702 ; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3703 ; AVX2-NEXT: vzeroupper
3706 ; AVX512-LABEL: trunc_xor_const_v8i32_v8i16:
3708 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
3709 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
3710 ; AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3711 ; AVX512-NEXT: vzeroupper
3713 %1 = xor <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
3714 %2 = trunc <8 x i32> %1 to <8 x i16>
; Xor of a <16 x i64> vector with the constant <0, 1, ..., 15>, followed by a
; truncate to <16 x i8>.
; In every check block below the narrowing sequence comes first and the xor is
; emitted last, as one xmm-wide pxor/vpxor against a constant-pool operand.
;  * SSE2 targets narrow with a 255-per-qword mask and a tree of packuswb.
;  * AVX1 targets narrow with a vbroadcastsd 255 mask, vpackusdw steps and a
;    final vpackuswb.
;  * AVX2 targets narrow with a vpbroadcastq mask, vpackusdw + vpermq pairs,
;    vpackuswb and vpshufd.
;  * AVX512 targets narrow with two vpmovqb joined by vpunpcklqdq.
3718 define <16 x i8> @trunc_xor_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
3719 ; SSE-LABEL: trunc_xor_const_v16i64_v16i8:
3721 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
3722 ; SSE-NEXT: pand %xmm8, %xmm7
3723 ; SSE-NEXT: pand %xmm8, %xmm6
3724 ; SSE-NEXT: packuswb %xmm7, %xmm6
3725 ; SSE-NEXT: pand %xmm8, %xmm5
3726 ; SSE-NEXT: pand %xmm8, %xmm4
3727 ; SSE-NEXT: packuswb %xmm5, %xmm4
3728 ; SSE-NEXT: packuswb %xmm6, %xmm4
3729 ; SSE-NEXT: pand %xmm8, %xmm3
3730 ; SSE-NEXT: pand %xmm8, %xmm2
3731 ; SSE-NEXT: packuswb %xmm3, %xmm2
3732 ; SSE-NEXT: pand %xmm8, %xmm1
3733 ; SSE-NEXT: pand %xmm8, %xmm0
3734 ; SSE-NEXT: packuswb %xmm1, %xmm0
3735 ; SSE-NEXT: packuswb %xmm2, %xmm0
3736 ; SSE-NEXT: packuswb %xmm4, %xmm0
3737 ; SSE-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
3740 ; AVX1-LABEL: trunc_xor_const_v16i64_v16i8:
3742 ; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [255,255,255,255]
3743 ; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
3744 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
3745 ; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
3746 ; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
3747 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
3748 ; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2
3749 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
3750 ; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
3751 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
3752 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
3753 ; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
3754 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
3755 ; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
3756 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
3757 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
3758 ; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3759 ; AVX1-NEXT: vzeroupper
3762 ; AVX2-LABEL: trunc_xor_const_v16i64_v16i8:
3764 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
3765 ; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
3766 ; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
3767 ; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2
3768 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
3769 ; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
3770 ; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
3771 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
3772 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
3773 ; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0
3774 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
3775 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
3776 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
3777 ; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3778 ; AVX2-NEXT: vzeroupper
3781 ; AVX512-LABEL: trunc_xor_const_v16i64_v16i8:
3783 ; AVX512-NEXT: vpmovqb %zmm1, %xmm1
3784 ; AVX512-NEXT: vpmovqb %zmm0, %xmm0
3785 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3786 ; AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3787 ; AVX512-NEXT: vzeroupper
3789 %1 = xor <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
3790 %2 = trunc <16 x i64> %1 to <16 x i8>
; Xor of a <16 x i32> vector with the constant <0, 1, ..., 15>, followed by a
; truncate to <16 x i8>.
; In every check block below the narrowing sequence comes first and the xor is
; emitted last, as one xmm-wide pxor/vpxor against a constant-pool operand.
;  * SSE2 targets narrow with a 255-per-dword mask and three packuswb.
;  * AVX1 targets narrow with a vbroadcastss 255 mask, two vpackusdw and a
;    vpackuswb.
;  * AVX2 targets narrow with a vpbroadcastd mask, vpackusdw, vpackuswb and
;    vpshufd.
;  * AVX512 targets narrow with a single vpmovdb.
3794 define <16 x i8> @trunc_xor_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
3795 ; SSE-LABEL: trunc_xor_const_v16i32_v16i8:
3797 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
3798 ; SSE-NEXT: pand %xmm4, %xmm3
3799 ; SSE-NEXT: pand %xmm4, %xmm2
3800 ; SSE-NEXT: packuswb %xmm3, %xmm2
3801 ; SSE-NEXT: pand %xmm4, %xmm1
3802 ; SSE-NEXT: pand %xmm4, %xmm0
3803 ; SSE-NEXT: packuswb %xmm1, %xmm0
3804 ; SSE-NEXT: packuswb %xmm2, %xmm0
3805 ; SSE-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
3808 ; AVX1-LABEL: trunc_xor_const_v16i32_v16i8:
3810 ; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
3811 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
3812 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
3813 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
3814 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
3815 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
3816 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
3817 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
3818 ; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3819 ; AVX1-NEXT: vzeroupper
3822 ; AVX2-LABEL: trunc_xor_const_v16i32_v16i8:
3824 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
3825 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
3826 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
3827 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
3828 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
3829 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
3830 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
3831 ; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3832 ; AVX2-NEXT: vzeroupper
3835 ; AVX512-LABEL: trunc_xor_const_v16i32_v16i8:
3837 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
3838 ; AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3839 ; AVX512-NEXT: vzeroupper
3841 %1 = xor <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
3842 %2 = trunc <16 x i32> %1 to <16 x i8>
; Xor of a <16 x i16> vector with the constant <0, 1, ..., 15>, followed by a
; truncate to <16 x i8>.
; In every check block below the narrowing sequence comes first and the xor is
; emitted last, as one xmm-wide pxor/vpxor against a constant-pool operand.
;  * SSE2 targets narrow with a 255-per-word mask and packuswb.
;  * AVX1 and AVX2 targets narrow with a constant-pool mask (vandps / vpand),
;    a 128-bit extract and vpackuswb.
;  * The AVX512F and AVX512DQ prefixes expect vpmovzxwd into zmm0 followed by
;    vpmovdb.
;  * The AVX512BW prefix expects a single vpmovwb.
3846 define <16 x i8> @trunc_xor_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
3847 ; SSE-LABEL: trunc_xor_const_v16i16_v16i8:
3849 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
3850 ; SSE-NEXT: pand %xmm2, %xmm1
3851 ; SSE-NEXT: pand %xmm2, %xmm0
3852 ; SSE-NEXT: packuswb %xmm1, %xmm0
3853 ; SSE-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
3856 ; AVX1-LABEL: trunc_xor_const_v16i16_v16i8:
3858 ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
3859 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
3860 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
3861 ; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3862 ; AVX1-NEXT: vzeroupper
3865 ; AVX2-LABEL: trunc_xor_const_v16i16_v16i8:
3867 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
3868 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
3869 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
3870 ; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3871 ; AVX2-NEXT: vzeroupper
3874 ; AVX512F-LABEL: trunc_xor_const_v16i16_v16i8:
3876 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
3877 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
3878 ; AVX512F-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3879 ; AVX512F-NEXT: vzeroupper
3880 ; AVX512F-NEXT: retq
3882 ; AVX512BW-LABEL: trunc_xor_const_v16i16_v16i8:
3883 ; AVX512BW: # %bb.0:
3884 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
3885 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
3886 ; AVX512BW-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3887 ; AVX512BW-NEXT: vzeroupper
3888 ; AVX512BW-NEXT: retq
3890 ; AVX512DQ-LABEL: trunc_xor_const_v16i16_v16i8:
3891 ; AVX512DQ: # %bb.0:
3892 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
3893 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
3894 ; AVX512DQ-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3895 ; AVX512DQ-NEXT: vzeroupper
3896 ; AVX512DQ-NEXT: retq
3897 %1 = xor <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
3898 %2 = trunc <16 x i16> %1 to <16 x i8>
; Bitwise or of two <4 x i64> vectors followed by a truncate to <4 x i32>.
; Lowering pinned by the autogenerated checks below, per run configuration -
;  * SSE2 targets use two orps and a shufps that keeps elements 0 and 2 of each
;    source register.
;  * The AVX1, AVX2-SLOW and AVX2-FAST-PERLANE prefixes expect a ymm vorps,
;    vextractf128 and vshufps.
;  * The AVX2-FAST-ALL prefix expects a ymm vorps and a vpermps driven by a
;    [0,2,4,6,0,2,4,6] index vector.
;  * AVX512 targets use a ymm vpor followed by vpmovqd on zmm0.
3906 define <4 x i32> @trunc_or_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
3907 ; SSE-LABEL: trunc_or_v4i64_v4i32:
3909 ; SSE-NEXT: orps %xmm3, %xmm1
3910 ; SSE-NEXT: orps %xmm2, %xmm0
3911 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3914 ; AVX1-LABEL: trunc_or_v4i64_v4i32:
3916 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
3917 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
3918 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3919 ; AVX1-NEXT: vzeroupper
3922 ; AVX2-SLOW-LABEL: trunc_or_v4i64_v4i32:
3923 ; AVX2-SLOW: # %bb.0:
3924 ; AVX2-SLOW-NEXT: vorps %ymm1, %ymm0, %ymm0
3925 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
3926 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3927 ; AVX2-SLOW-NEXT: vzeroupper
3928 ; AVX2-SLOW-NEXT: retq
3930 ; AVX2-FAST-ALL-LABEL: trunc_or_v4i64_v4i32:
3931 ; AVX2-FAST-ALL: # %bb.0:
3932 ; AVX2-FAST-ALL-NEXT: vorps %ymm1, %ymm0, %ymm0
3933 ; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6]
3934 ; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1]
3935 ; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0
3936 ; AVX2-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
3937 ; AVX2-FAST-ALL-NEXT: vzeroupper
3938 ; AVX2-FAST-ALL-NEXT: retq
3940 ; AVX2-FAST-PERLANE-LABEL: trunc_or_v4i64_v4i32:
3941 ; AVX2-FAST-PERLANE: # %bb.0:
3942 ; AVX2-FAST-PERLANE-NEXT: vorps %ymm1, %ymm0, %ymm0
3943 ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1
3944 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3945 ; AVX2-FAST-PERLANE-NEXT: vzeroupper
3946 ; AVX2-FAST-PERLANE-NEXT: retq
3948 ; AVX512-LABEL: trunc_or_v4i64_v4i32:
3950 ; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
3951 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
3952 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
3953 ; AVX512-NEXT: vzeroupper
3955 %1 = or <4 x i64> %a0, %a1
3956 %2 = trunc <4 x i64> %1 to <4 x i32>
; Bitwise or of two <8 x i64> vectors followed by a truncate to <8 x i16>.
; Lowering pinned by the autogenerated checks below, per run configuration -
;  * SSE2 targets or the four xmm register pairs, keep elements 0 and 2 of
;    each pair with shufps, then narrow with pslld/psrad by 16 and packssdw.
;  * AVX1 targets or in ymm, mask with a 65535 splat and narrow with a chain
;    of vpackusdw.
;  * AVX2 targets or in ymm, blend the upper words with zero (vpblendw) and
;    narrow with vpackusdw followed by vpshufd.
;  * AVX512 targets use one zmm vporq and a single vpmovqw.
3960 define <8 x i16> @trunc_or_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
3961 ; SSE-LABEL: trunc_or_v8i64_v8i16:
3963 ; SSE-NEXT: orps %xmm5, %xmm1
3964 ; SSE-NEXT: orps %xmm4, %xmm0
3965 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3966 ; SSE-NEXT: orps %xmm7, %xmm3
3967 ; SSE-NEXT: orps %xmm6, %xmm2
3968 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
3969 ; SSE-NEXT: pslld $16, %xmm2
3970 ; SSE-NEXT: psrad $16, %xmm2
3971 ; SSE-NEXT: pslld $16, %xmm0
3972 ; SSE-NEXT: psrad $16, %xmm0
3973 ; SSE-NEXT: packssdw %xmm2, %xmm0
3976 ; AVX1-LABEL: trunc_or_v8i64_v8i16:
3978 ; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0
3979 ; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1
3980 ; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,65535,65535,65535]
3981 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
3982 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
3983 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
3984 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
3985 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
3986 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
3987 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
3988 ; AVX1-NEXT: vzeroupper
3991 ; AVX2-LABEL: trunc_or_v8i64_v8i16:
3993 ; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0
3994 ; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1
3995 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
3996 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
3997 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
3998 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
3999 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
4000 ; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
4001 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
4002 ; AVX2-NEXT: vzeroupper
4005 ; AVX512-LABEL: trunc_or_v8i64_v8i16:
4007 ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
4008 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0
4009 ; AVX512-NEXT: vzeroupper
4011 %1 = or <8 x i64> %a0, %a1
4012 %2 = trunc <8 x i64> %1 to <8 x i16>
; Bitwise or of two <8 x i32> vectors followed by a truncate to <8 x i16>.
; Lowering pinned by the autogenerated checks below, per run configuration -
;  * SSE2 targets use two por, then pslld/psrad by 16 on each half and one
;    packssdw.
;  * AVX1 targets use a ymm vorps, a vandps with a constant-pool mask, then
;    vextractf128 + vpackusdw.
;  * AVX2 targets use a ymm vpor, a vpshufb that gathers bytes 0,1,4,5,8,9,
;    12,13 of each 128-bit lane, and a vpermq.
;  * AVX512 targets use a ymm vpor followed by vpmovdw on zmm0.
4016 define <8 x i16> @trunc_or_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
4017 ; SSE-LABEL: trunc_or_v8i32_v8i16:
4019 ; SSE-NEXT: por %xmm2, %xmm0
4020 ; SSE-NEXT: por %xmm3, %xmm1
4021 ; SSE-NEXT: pslld $16, %xmm1
4022 ; SSE-NEXT: psrad $16, %xmm1
4023 ; SSE-NEXT: pslld $16, %xmm0
4024 ; SSE-NEXT: psrad $16, %xmm0
4025 ; SSE-NEXT: packssdw %xmm1, %xmm0
4028 ; AVX1-LABEL: trunc_or_v8i32_v8i16:
4030 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
4031 ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
4032 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
4033 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
4034 ; AVX1-NEXT: vzeroupper
4037 ; AVX2-LABEL: trunc_or_v8i32_v8i16:
4039 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
4040 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
4041 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4042 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
4043 ; AVX2-NEXT: vzeroupper
4046 ; AVX512-LABEL: trunc_or_v8i32_v8i16:
4048 ; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
4049 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
4050 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
4051 ; AVX512-NEXT: vzeroupper
4053 %1 = or <8 x i32> %a0, %a1
4054 %2 = trunc <8 x i32> %1 to <8 x i16>
; Bitwise or of two <16 x i64> vectors followed by a truncate to <16 x i8>.
; Lowering pinned by the autogenerated checks below, per run configuration -
;  * SSE2 targets or xmm0-xmm7 with operands read from the stack
;    (por N(%rsp)), mask every register with a 255-per-qword constant and
;    combine them through a tree of packuswb.
;  * AVX1 targets use four ymm vorps, a vbroadcastsd 255 mask, then vpackusdw
;    steps and a final vpackuswb.
;  * AVX2 targets use four ymm vpor, a vpbroadcastq mask, vpackusdw + vpermq
;    pairs, then vpackuswb and vpshufd.
;  * AVX512 targets use two zmm vporq, two vpmovqb, and a vpunpcklqdq that
;    joins the two 8-byte halves.
4058 define <16 x i8> @trunc_or_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
4059 ; SSE-LABEL: trunc_or_v16i64_v16i8:
4061 ; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm0
4062 ; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm1
4063 ; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm2
4064 ; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm3
4065 ; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm4
4066 ; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm5
4067 ; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm6
4068 ; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm7
4069 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
4070 ; SSE-NEXT: pand %xmm8, %xmm7
4071 ; SSE-NEXT: pand %xmm8, %xmm6
4072 ; SSE-NEXT: packuswb %xmm7, %xmm6
4073 ; SSE-NEXT: pand %xmm8, %xmm5
4074 ; SSE-NEXT: pand %xmm8, %xmm4
4075 ; SSE-NEXT: packuswb %xmm5, %xmm4
4076 ; SSE-NEXT: packuswb %xmm6, %xmm4
4077 ; SSE-NEXT: pand %xmm8, %xmm3
4078 ; SSE-NEXT: pand %xmm8, %xmm2
4079 ; SSE-NEXT: packuswb %xmm3, %xmm2
4080 ; SSE-NEXT: pand %xmm8, %xmm1
4081 ; SSE-NEXT: pand %xmm8, %xmm0
4082 ; SSE-NEXT: packuswb %xmm1, %xmm0
4083 ; SSE-NEXT: packuswb %xmm2, %xmm0
4084 ; SSE-NEXT: packuswb %xmm4, %xmm0
4087 ; AVX1-LABEL: trunc_or_v16i64_v16i8:
4089 ; AVX1-NEXT: vorps %ymm4, %ymm0, %ymm0
4090 ; AVX1-NEXT: vorps %ymm5, %ymm1, %ymm1
4091 ; AVX1-NEXT: vorps %ymm6, %ymm2, %ymm2
4092 ; AVX1-NEXT: vorps %ymm7, %ymm3, %ymm3
4093 ; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [255,255,255,255]
4094 ; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
4095 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
4096 ; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
4097 ; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
4098 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
4099 ; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2
4100 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
4101 ; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
4102 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
4103 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
4104 ; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
4105 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
4106 ; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
4107 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
4108 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
4109 ; AVX1-NEXT: vzeroupper
4112 ; AVX2-LABEL: trunc_or_v16i64_v16i8:
4114 ; AVX2-NEXT: vpor %ymm4, %ymm0, %ymm0
4115 ; AVX2-NEXT: vpor %ymm5, %ymm1, %ymm1
4116 ; AVX2-NEXT: vpor %ymm6, %ymm2, %ymm2
4117 ; AVX2-NEXT: vpor %ymm7, %ymm3, %ymm3
4118 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
4119 ; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
4120 ; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
4121 ; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2
4122 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
4123 ; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
4124 ; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
4125 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
4126 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
4127 ; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0
4128 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
4129 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
4130 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
4131 ; AVX2-NEXT: vzeroupper
4134 ; AVX512-LABEL: trunc_or_v16i64_v16i8:
4136 ; AVX512-NEXT: vporq %zmm2, %zmm0, %zmm0
4137 ; AVX512-NEXT: vporq %zmm3, %zmm1, %zmm1
4138 ; AVX512-NEXT: vpmovqb %zmm1, %xmm1
4139 ; AVX512-NEXT: vpmovqb %zmm0, %xmm0
4140 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4141 ; AVX512-NEXT: vzeroupper
4143 %1 = or <16 x i64> %a0, %a1
4144 %2 = trunc <16 x i64> %1 to <16 x i8>
; Bitwise or of two <16 x i32> vectors followed by a truncate to <16 x i8>.
; Lowering pinned by the autogenerated checks below, per run configuration -
;  * SSE2 targets use four por, a 255-per-dword mask and three packuswb.
;  * AVX1 targets use two ymm vorps, a vbroadcastss 255 mask, two vpackusdw
;    and a final vpackuswb.
;  * AVX2 targets use two ymm vpor, a vpbroadcastd mask, vpackusdw,
;    vextracti128 + vpackuswb, then vpshufd.
;  * AVX512 targets use one zmm vpord and a single vpmovdb.
4148 define <16 x i8> @trunc_or_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
4149 ; SSE-LABEL: trunc_or_v16i32_v16i8:
4151 ; SSE-NEXT: por %xmm4, %xmm0
4152 ; SSE-NEXT: por %xmm5, %xmm1
4153 ; SSE-NEXT: por %xmm6, %xmm2
4154 ; SSE-NEXT: por %xmm7, %xmm3
4155 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
4156 ; SSE-NEXT: pand %xmm4, %xmm3
4157 ; SSE-NEXT: pand %xmm4, %xmm2
4158 ; SSE-NEXT: packuswb %xmm3, %xmm2
4159 ; SSE-NEXT: pand %xmm4, %xmm1
4160 ; SSE-NEXT: pand %xmm4, %xmm0
4161 ; SSE-NEXT: packuswb %xmm1, %xmm0
4162 ; SSE-NEXT: packuswb %xmm2, %xmm0
4165 ; AVX1-LABEL: trunc_or_v16i32_v16i8:
4167 ; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0
4168 ; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1
4169 ; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
4170 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
4171 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
4172 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
4173 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
4174 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
4175 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
4176 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
4177 ; AVX1-NEXT: vzeroupper
4180 ; AVX2-LABEL: trunc_or_v16i32_v16i8:
4182 ; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0
4183 ; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1
4184 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
4185 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
4186 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
4187 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
4188 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
4189 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
4190 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
4191 ; AVX2-NEXT: vzeroupper
4194 ; AVX512-LABEL: trunc_or_v16i32_v16i8:
4196 ; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0
4197 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
4198 ; AVX512-NEXT: vzeroupper
4200 %1 = or <16 x i32> %a0, %a1
4201 %2 = trunc <16 x i32> %1 to <16 x i8>
; Bitwise-OR two <16 x i16> vectors, then truncate the result to <16 x i8>.
; The 512-bit runs have separate expectations here: the +avx512bw output
; narrows with vpmovwb, while the +avx512f and +avx512dq outputs zero-extend
; to 32-bit elements (vpmovzxwd) and then narrow with vpmovdb.
4205 define <16 x i8> @trunc_or_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
4206 ; SSE-LABEL: trunc_or_v16i16_v16i8:
4208 ; SSE-NEXT: por %xmm2, %xmm0
4209 ; SSE-NEXT: por %xmm3, %xmm1
4210 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
4211 ; SSE-NEXT: pand %xmm2, %xmm1
4212 ; SSE-NEXT: pand %xmm2, %xmm0
4213 ; SSE-NEXT: packuswb %xmm1, %xmm0
4216 ; AVX1-LABEL: trunc_or_v16i16_v16i8:
4218 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
4219 ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
4220 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
4221 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
4222 ; AVX1-NEXT: vzeroupper
4225 ; AVX2-LABEL: trunc_or_v16i16_v16i8:
4227 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
4228 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
4229 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
4230 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
4231 ; AVX2-NEXT: vzeroupper
4234 ; AVX512F-LABEL: trunc_or_v16i16_v16i8:
4236 ; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
4237 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
4238 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
4239 ; AVX512F-NEXT: vzeroupper
4240 ; AVX512F-NEXT: retq
4242 ; AVX512BW-LABEL: trunc_or_v16i16_v16i8:
4243 ; AVX512BW: # %bb.0:
4244 ; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
4245 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
4246 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
4247 ; AVX512BW-NEXT: vzeroupper
4248 ; AVX512BW-NEXT: retq
4250 ; AVX512DQ-LABEL: trunc_or_v16i16_v16i8:
4251 ; AVX512DQ: # %bb.0:
4252 ; AVX512DQ-NEXT: vpor %ymm1, %ymm0, %ymm0
4253 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
4254 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
4255 ; AVX512DQ-NEXT: vzeroupper
4256 ; AVX512DQ-NEXT: retq
4257 %1 = or <16 x i16> %a0, %a1
4258 %2 = trunc <16 x i16> %1 to <16 x i8>
; OR a <4 x i64> vector with the constant <0, 1, 2, 3>, then truncate the
; result to <4 x i32>.
; In the expected output below the narrowing shuffle comes first and the OR is
; applied afterwards to the 128-bit result, using a constant-pool operand.
; The avx2 runs have per-configuration expectations because the
; fast-variable-crosslane-shuffle run narrows with vpermps instead of
; vextractf128 + vshufps.
4266 define <4 x i32> @trunc_or_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
4267 ; SSE-LABEL: trunc_or_const_v4i64_v4i32:
4269 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
4270 ; SSE-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
4273 ; AVX1-LABEL: trunc_or_const_v4i64_v4i32:
4275 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
4276 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
4277 ; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4278 ; AVX1-NEXT: vzeroupper
4281 ; AVX2-SLOW-LABEL: trunc_or_const_v4i64_v4i32:
4282 ; AVX2-SLOW: # %bb.0:
4283 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
4284 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
4285 ; AVX2-SLOW-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4286 ; AVX2-SLOW-NEXT: vzeroupper
4287 ; AVX2-SLOW-NEXT: retq
4289 ; AVX2-FAST-ALL-LABEL: trunc_or_const_v4i64_v4i32:
4290 ; AVX2-FAST-ALL: # %bb.0:
4291 ; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6]
4292 ; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1]
4293 ; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0
4294 ; AVX2-FAST-ALL-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4295 ; AVX2-FAST-ALL-NEXT: vzeroupper
4296 ; AVX2-FAST-ALL-NEXT: retq
4298 ; AVX2-FAST-PERLANE-LABEL: trunc_or_const_v4i64_v4i32:
4299 ; AVX2-FAST-PERLANE: # %bb.0:
4300 ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1
4301 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
4302 ; AVX2-FAST-PERLANE-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4303 ; AVX2-FAST-PERLANE-NEXT: vzeroupper
4304 ; AVX2-FAST-PERLANE-NEXT: retq
4306 ; AVX512-LABEL: trunc_or_const_v4i64_v4i32:
4308 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
4309 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
4310 ; AVX512-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4311 ; AVX512-NEXT: vzeroupper
4313 %1 = or <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
4314 %2 = trunc <4 x i64> %1 to <4 x i32>
; OR a <8 x i64> vector with the constant <0, 1, ..., 7>, then truncate the
; result to <8 x i16>.
; In the expected output below the narrowing sequence comes first and the OR
; is the last vector operation, applied to the 128-bit result with a
; constant-pool operand.
4318 define <8 x i16> @trunc_or_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
4319 ; SSE-LABEL: trunc_or_const_v8i64_v8i16:
4321 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
4322 ; SSE-NEXT: pslld $16, %xmm2
4323 ; SSE-NEXT: psrad $16, %xmm2
4324 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
4325 ; SSE-NEXT: pslld $16, %xmm0
4326 ; SSE-NEXT: psrad $16, %xmm0
4327 ; SSE-NEXT: packssdw %xmm2, %xmm0
4328 ; SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
4331 ; AVX1-LABEL: trunc_or_const_v8i64_v8i16:
4333 ; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,65535,65535,65535]
4334 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
4335 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
4336 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
4337 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
4338 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
4339 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
4340 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
4341 ; AVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4342 ; AVX1-NEXT: vzeroupper
4345 ; AVX2-LABEL: trunc_or_const_v8i64_v8i16:
4347 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
4348 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
4349 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
4350 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
4351 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
4352 ; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
4353 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
4354 ; AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4355 ; AVX2-NEXT: vzeroupper
4358 ; AVX512-LABEL: trunc_or_const_v8i64_v8i16:
4360 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0
4361 ; AVX512-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4362 ; AVX512-NEXT: vzeroupper
4364 %1 = or <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
4365 %2 = trunc <8 x i64> %1 to <8 x i16>
; OR a <8 x i32> vector with the constant <0, 1, ..., 7>, then truncate the
; result to <8 x i16>.
; In the expected output below the narrowing sequence comes first and the OR
; is applied afterwards to the 128-bit result with a constant-pool operand.
4369 define <8 x i16> @trunc_or_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
4370 ; SSE-LABEL: trunc_or_const_v8i32_v8i16:
4372 ; SSE-NEXT: pslld $16, %xmm1
4373 ; SSE-NEXT: psrad $16, %xmm1
4374 ; SSE-NEXT: pslld $16, %xmm0
4375 ; SSE-NEXT: psrad $16, %xmm0
4376 ; SSE-NEXT: packssdw %xmm1, %xmm0
4377 ; SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
4380 ; AVX1-LABEL: trunc_or_const_v8i32_v8i16:
4382 ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
4383 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
4384 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
4385 ; AVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4386 ; AVX1-NEXT: vzeroupper
4389 ; AVX2-LABEL: trunc_or_const_v8i32_v8i16:
4391 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
4392 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4393 ; AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4394 ; AVX2-NEXT: vzeroupper
4397 ; AVX512-LABEL: trunc_or_const_v8i32_v8i16:
4399 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
4400 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
4401 ; AVX512-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4402 ; AVX512-NEXT: vzeroupper
4404 %1 = or <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
4405 %2 = trunc <8 x i32> %1 to <8 x i16>
; OR a <16 x i64> vector with the constant <0, 1, ..., 15>, then truncate the
; result to <16 x i8>.
; In the expected output below the mask-and-pack (or vpmovqb) narrowing comes
; first and the OR is the last vector operation, applied to the 128-bit result
; with a constant-pool operand.
4409 define <16 x i8> @trunc_or_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
4410 ; SSE-LABEL: trunc_or_const_v16i64_v16i8:
4412 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
4413 ; SSE-NEXT: pand %xmm8, %xmm7
4414 ; SSE-NEXT: pand %xmm8, %xmm6
4415 ; SSE-NEXT: packuswb %xmm7, %xmm6
4416 ; SSE-NEXT: pand %xmm8, %xmm5
4417 ; SSE-NEXT: pand %xmm8, %xmm4
4418 ; SSE-NEXT: packuswb %xmm5, %xmm4
4419 ; SSE-NEXT: packuswb %xmm6, %xmm4
4420 ; SSE-NEXT: pand %xmm8, %xmm3
4421 ; SSE-NEXT: pand %xmm8, %xmm2
4422 ; SSE-NEXT: packuswb %xmm3, %xmm2
4423 ; SSE-NEXT: pand %xmm8, %xmm1
4424 ; SSE-NEXT: pand %xmm8, %xmm0
4425 ; SSE-NEXT: packuswb %xmm1, %xmm0
4426 ; SSE-NEXT: packuswb %xmm2, %xmm0
4427 ; SSE-NEXT: packuswb %xmm4, %xmm0
4428 ; SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
4431 ; AVX1-LABEL: trunc_or_const_v16i64_v16i8:
4433 ; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [255,255,255,255]
4434 ; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
4435 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
4436 ; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
4437 ; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
4438 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
4439 ; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2
4440 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
4441 ; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
4442 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
4443 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
4444 ; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
4445 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
4446 ; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
4447 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
4448 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
4449 ; AVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4450 ; AVX1-NEXT: vzeroupper
4453 ; AVX2-LABEL: trunc_or_const_v16i64_v16i8:
4455 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
4456 ; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
4457 ; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
4458 ; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2
4459 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
4460 ; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
4461 ; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
4462 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
4463 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
4464 ; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0
4465 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
4466 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
4467 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
4468 ; AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4469 ; AVX2-NEXT: vzeroupper
4472 ; AVX512-LABEL: trunc_or_const_v16i64_v16i8:
4474 ; AVX512-NEXT: vpmovqb %zmm1, %xmm1
4475 ; AVX512-NEXT: vpmovqb %zmm0, %xmm0
4476 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4477 ; AVX512-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4478 ; AVX512-NEXT: vzeroupper
4480 %1 = or <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
4481 %2 = trunc <16 x i64> %1 to <16 x i8>
; OR a <16 x i32> vector with the constant <0, 1, ..., 15>, then truncate the
; result to <16 x i8>.
; In the expected output below the narrowing sequence comes first and the OR
; is applied afterwards to the 128-bit result with a constant-pool operand.
4485 define <16 x i8> @trunc_or_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
4486 ; SSE-LABEL: trunc_or_const_v16i32_v16i8:
4488 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
4489 ; SSE-NEXT: pand %xmm4, %xmm3
4490 ; SSE-NEXT: pand %xmm4, %xmm2
4491 ; SSE-NEXT: packuswb %xmm3, %xmm2
4492 ; SSE-NEXT: pand %xmm4, %xmm1
4493 ; SSE-NEXT: pand %xmm4, %xmm0
4494 ; SSE-NEXT: packuswb %xmm1, %xmm0
4495 ; SSE-NEXT: packuswb %xmm2, %xmm0
4496 ; SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
4499 ; AVX1-LABEL: trunc_or_const_v16i32_v16i8:
4501 ; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
4502 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
4503 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
4504 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
4505 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
4506 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
4507 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
4508 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
4509 ; AVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4510 ; AVX1-NEXT: vzeroupper
4513 ; AVX2-LABEL: trunc_or_const_v16i32_v16i8:
4515 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
4516 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
4517 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
4518 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
4519 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
4520 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
4521 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
4522 ; AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4523 ; AVX2-NEXT: vzeroupper
4526 ; AVX512-LABEL: trunc_or_const_v16i32_v16i8:
4528 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
4529 ; AVX512-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4530 ; AVX512-NEXT: vzeroupper
4532 %1 = or <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
4533 %2 = trunc <16 x i32> %1 to <16 x i8>
; OR a <16 x i16> vector with the constant <0, 1, ..., 15>, then truncate the
; result to <16 x i8>.
; In the expected output below the narrowing sequence comes first and the OR
; is applied afterwards to the 128-bit result with a constant-pool operand.
; The 512-bit runs have separate expectations: the +avx512bw output narrows
; with vpmovwb, the +avx512f and +avx512dq outputs use vpmovzxwd + vpmovdb.
4537 define <16 x i8> @trunc_or_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
4538 ; SSE-LABEL: trunc_or_const_v16i16_v16i8:
4540 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
4541 ; SSE-NEXT: pand %xmm2, %xmm1
4542 ; SSE-NEXT: pand %xmm2, %xmm0
4543 ; SSE-NEXT: packuswb %xmm1, %xmm0
4544 ; SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
4547 ; AVX1-LABEL: trunc_or_const_v16i16_v16i8:
4549 ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
4550 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
4551 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
4552 ; AVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4553 ; AVX1-NEXT: vzeroupper
4556 ; AVX2-LABEL: trunc_or_const_v16i16_v16i8:
4558 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
4559 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
4560 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
4561 ; AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4562 ; AVX2-NEXT: vzeroupper
4565 ; AVX512F-LABEL: trunc_or_const_v16i16_v16i8:
4567 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
4568 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
4569 ; AVX512F-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4570 ; AVX512F-NEXT: vzeroupper
4571 ; AVX512F-NEXT: retq
4573 ; AVX512BW-LABEL: trunc_or_const_v16i16_v16i8:
4574 ; AVX512BW: # %bb.0:
4575 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
4576 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
4577 ; AVX512BW-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4578 ; AVX512BW-NEXT: vzeroupper
4579 ; AVX512BW-NEXT: retq
4581 ; AVX512DQ-LABEL: trunc_or_const_v16i16_v16i8:
4582 ; AVX512DQ: # %bb.0:
4583 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
4584 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
4585 ; AVX512DQ-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4586 ; AVX512DQ-NEXT: vzeroupper
4587 ; AVX512DQ-NEXT: retq
4588 %1 = or <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
4589 %2 = trunc <16 x i16> %1 to <16 x i8>
4594 ; Complex patterns - often created by the vectorizer.
; Sign-extend both <4 x i32> inputs to <4 x i64>, multiply them, add the
; constant <-3, -1, 1, 3>, and truncate the sum back to <4 x i32>.
; The expected avx output is just vpmulld + vpaddd on xmm registers; the
; expected sse2 output uses pshufd / pmuludq / shufps for the multiply,
; followed by paddd with a constant-pool operand.
4597 define <4 x i32> @mul_add_const_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
4598 ; SSE-LABEL: mul_add_const_v4i64_v4i32:
4600 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
4601 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3]
4602 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
4603 ; SSE-NEXT: pmuludq %xmm2, %xmm0
4604 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
4605 ; SSE-NEXT: pmuludq %xmm3, %xmm1
4606 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
4607 ; SSE-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
4610 ; AVX-LABEL: mul_add_const_v4i64_v4i32:
4612 ; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
4613 ; AVX-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4615 %1 = sext <4 x i32> %a0 to <4 x i64>
4616 %2 = sext <4 x i32> %a1 to <4 x i64>
4617 %3 = mul <4 x i64> %1, %2
4618 %4 = add <4 x i64> %3, <i64 -3, i64 -1, i64 1, i64 3>
4619 %5 = trunc <4 x i64> %4 to <4 x i32>
; Sign-extend both <4 x i32> inputs to <4 x i64>, multiply them, add the
; product to itself, and truncate the sum back to <4 x i32>.
; The expected avx output is vpmulld followed by vpaddd of the product with
; itself; the expected sse2 output uses pshufd / pmuludq / shufps for the
; multiply, followed by paddd %xmm0, %xmm0.
4623 define <4 x i32> @mul_add_self_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
4624 ; SSE-LABEL: mul_add_self_v4i64_v4i32:
4626 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
4627 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3]
4628 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
4629 ; SSE-NEXT: pmuludq %xmm2, %xmm0
4630 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
4631 ; SSE-NEXT: pmuludq %xmm3, %xmm1
4632 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
4633 ; SSE-NEXT: paddd %xmm0, %xmm0
4636 ; AVX-LABEL: mul_add_self_v4i64_v4i32:
4638 ; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
4639 ; AVX-NEXT: vpaddd %xmm0, %xmm0, %xmm0
4641 %1 = sext <4 x i32> %a0 to <4 x i64>
4642 %2 = sext <4 x i32> %a1 to <4 x i64>
4643 %3 = mul <4 x i64> %1, %2
4644 %4 = add <4 x i64> %3, %3
4645 %5 = trunc <4 x i64> %4 to <4 x i32>
4649 define <4 x i32> @mul_add_multiuse_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
4650 ; SSE-LABEL: mul_add_multiuse_v4i64_v4i32:
4652 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
4653 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3]
4654 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,1,1,3]
4655 ; SSE-NEXT: pmuludq %xmm2, %xmm4
4656 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
4657 ; SSE-NEXT: pmuludq %xmm3, %xmm1
4658 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm1[0,2]
4659 ; SSE-NEXT: paddd %xmm4, %xmm0
4662 ; AVX-LABEL: mul_add_multiuse_v4i64_v4i32:
4664 ; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm1
4665 ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
4667 %1 = sext <4 x i32> %a0 to <4 x i64>
4668 %2 = sext <4 x i32> %a1 to <4 x i64>
4669 %3 = mul <4 x i64> %1, %2
4670 %4 = add <4 x i64> %1, %3
4671 %5 = trunc <4 x i64> %4 to <4 x i32>