1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,AVX,AVX2,AVX2-SLOW
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX2,AVX2-FAST
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512BW
8 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512DQ
14 define <4 x i32> @trunc_add_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
15 ; SSE-LABEL: trunc_add_v4i64_v4i32:
17 ; SSE-NEXT: paddq %xmm3, %xmm1
18 ; SSE-NEXT: paddq %xmm2, %xmm0
19 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
22 ; AVX1-LABEL: trunc_add_v4i64_v4i32:
24 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
25 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
26 ; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2
27 ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
28 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
29 ; AVX1-NEXT: vzeroupper
32 ; AVX2-SLOW-LABEL: trunc_add_v4i64_v4i32:
34 ; AVX2-SLOW-NEXT: vpaddq %ymm1, %ymm0, %ymm0
35 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
36 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
37 ; AVX2-SLOW-NEXT: vzeroupper
38 ; AVX2-SLOW-NEXT: retq
40 ; AVX2-FAST-LABEL: trunc_add_v4i64_v4i32:
42 ; AVX2-FAST-NEXT: vpaddq %ymm1, %ymm0, %ymm0
43 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
44 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
45 ; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
46 ; AVX2-FAST-NEXT: vzeroupper
47 ; AVX2-FAST-NEXT: retq
49 ; AVX512-LABEL: trunc_add_v4i64_v4i32:
51 ; AVX512-NEXT: vpaddq %ymm1, %ymm0, %ymm0
52 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
53 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
54 ; AVX512-NEXT: vzeroupper
56 %1 = add <4 x i64> %a0, %a1
57 %2 = trunc <4 x i64> %1 to <4 x i32>
61 define <8 x i16> @trunc_add_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
62 ; SSE-LABEL: trunc_add_v8i64_v8i16:
64 ; SSE-NEXT: paddq %xmm6, %xmm2
65 ; SSE-NEXT: paddq %xmm7, %xmm3
66 ; SSE-NEXT: paddq %xmm4, %xmm0
67 ; SSE-NEXT: paddq %xmm5, %xmm1
68 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
69 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
70 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
71 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
72 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
73 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
74 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
75 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
76 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
77 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
78 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
81 ; AVX1-LABEL: trunc_add_v8i64_v8i16:
83 ; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm4
84 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
85 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
86 ; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0
87 ; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm2
88 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
89 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
90 ; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1
91 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
92 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
93 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
94 ; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
95 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
96 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1,2,3],xmm4[4],xmm3[5,6,7]
97 ; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
98 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
99 ; AVX1-NEXT: vzeroupper
102 ; AVX2-SLOW-LABEL: trunc_add_v8i64_v8i16:
103 ; AVX2-SLOW: # %bb.0:
104 ; AVX2-SLOW-NEXT: vpaddq %ymm2, %ymm0, %ymm0
105 ; AVX2-SLOW-NEXT: vpaddq %ymm3, %ymm1, %ymm1
106 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2
107 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
108 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2
109 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
110 ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
111 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
112 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
113 ; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
114 ; AVX2-SLOW-NEXT: vzeroupper
115 ; AVX2-SLOW-NEXT: retq
117 ; AVX2-FAST-LABEL: trunc_add_v8i64_v8i16:
118 ; AVX2-FAST: # %bb.0:
119 ; AVX2-FAST-NEXT: vpaddq %ymm3, %ymm1, %ymm1
120 ; AVX2-FAST-NEXT: vpaddq %ymm2, %ymm0, %ymm0
121 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
122 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
123 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
124 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
125 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
126 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
127 ; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
128 ; AVX2-FAST-NEXT: vzeroupper
129 ; AVX2-FAST-NEXT: retq
131 ; AVX512-LABEL: trunc_add_v8i64_v8i16:
133 ; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0
134 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0
135 ; AVX512-NEXT: vzeroupper
137 %1 = add <8 x i64> %a0, %a1
138 %2 = trunc <8 x i64> %1 to <8 x i16>
142 define <8 x i16> @trunc_add_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
143 ; SSE-LABEL: trunc_add_v8i32_v8i16:
145 ; SSE-NEXT: paddd %xmm2, %xmm0
146 ; SSE-NEXT: paddd %xmm3, %xmm1
147 ; SSE-NEXT: pslld $16, %xmm1
148 ; SSE-NEXT: psrad $16, %xmm1
149 ; SSE-NEXT: pslld $16, %xmm0
150 ; SSE-NEXT: psrad $16, %xmm0
151 ; SSE-NEXT: packssdw %xmm1, %xmm0
154 ; AVX1-LABEL: trunc_add_v8i32_v8i16:
156 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm2
157 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
158 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
159 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
160 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
161 ; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
162 ; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1
163 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
164 ; AVX1-NEXT: vzeroupper
167 ; AVX2-LABEL: trunc_add_v8i32_v8i16:
169 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
170 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
171 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
172 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
173 ; AVX2-NEXT: vzeroupper
176 ; AVX512-LABEL: trunc_add_v8i32_v8i16:
178 ; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0
179 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
180 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
181 ; AVX512-NEXT: vzeroupper
183 %1 = add <8 x i32> %a0, %a1
184 %2 = trunc <8 x i32> %1 to <8 x i16>
188 define <16 x i8> @trunc_add_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
189 ; SSE-LABEL: trunc_add_v16i64_v16i8:
191 ; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm0
192 ; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm1
193 ; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm2
194 ; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm3
195 ; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm4
196 ; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm5
197 ; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm6
198 ; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm7
199 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
200 ; SSE-NEXT: pand %xmm8, %xmm7
201 ; SSE-NEXT: pand %xmm8, %xmm6
202 ; SSE-NEXT: packuswb %xmm7, %xmm6
203 ; SSE-NEXT: pand %xmm8, %xmm5
204 ; SSE-NEXT: pand %xmm8, %xmm4
205 ; SSE-NEXT: packuswb %xmm5, %xmm4
206 ; SSE-NEXT: packuswb %xmm6, %xmm4
207 ; SSE-NEXT: pand %xmm8, %xmm3
208 ; SSE-NEXT: pand %xmm8, %xmm2
209 ; SSE-NEXT: packuswb %xmm3, %xmm2
210 ; SSE-NEXT: pand %xmm8, %xmm1
211 ; SSE-NEXT: pand %xmm8, %xmm0
212 ; SSE-NEXT: packuswb %xmm1, %xmm0
213 ; SSE-NEXT: packuswb %xmm2, %xmm0
214 ; SSE-NEXT: packuswb %xmm4, %xmm0
217 ; AVX1-LABEL: trunc_add_v16i64_v16i8:
219 ; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm8
220 ; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm4
221 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
222 ; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm0
223 ; AVX1-NEXT: vpaddq %xmm5, %xmm1, %xmm4
224 ; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm5
225 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
226 ; AVX1-NEXT: vpaddq %xmm5, %xmm1, %xmm1
227 ; AVX1-NEXT: vpaddq %xmm6, %xmm2, %xmm5
228 ; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm6
229 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
230 ; AVX1-NEXT: vpaddq %xmm6, %xmm2, %xmm2
231 ; AVX1-NEXT: vpaddq %xmm7, %xmm3, %xmm6
232 ; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm7
233 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
234 ; AVX1-NEXT: vpaddq %xmm7, %xmm3, %xmm3
235 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [255,255]
236 ; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
237 ; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6
238 ; AVX1-NEXT: vpackusdw %xmm3, %xmm6, %xmm3
239 ; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2
240 ; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5
241 ; AVX1-NEXT: vpackusdw %xmm2, %xmm5, %xmm2
242 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
243 ; AVX1-NEXT: vpand %xmm7, %xmm1, %xmm1
244 ; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm3
245 ; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1
246 ; AVX1-NEXT: vpand %xmm7, %xmm0, %xmm0
247 ; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm3
248 ; AVX1-NEXT: vpackusdw %xmm0, %xmm3, %xmm0
249 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
250 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
251 ; AVX1-NEXT: vzeroupper
254 ; AVX2-SLOW-LABEL: trunc_add_v16i64_v16i8:
255 ; AVX2-SLOW: # %bb.0:
256 ; AVX2-SLOW-NEXT: vpaddq %ymm4, %ymm0, %ymm0
257 ; AVX2-SLOW-NEXT: vpaddq %ymm5, %ymm1, %ymm1
258 ; AVX2-SLOW-NEXT: vpaddq %ymm6, %ymm2, %ymm2
259 ; AVX2-SLOW-NEXT: vpaddq %ymm7, %ymm3, %ymm3
260 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4
261 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
262 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm4
263 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
264 ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
265 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
266 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
267 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
268 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
269 ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
270 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm5
271 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
272 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm5
273 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
274 ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
275 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
276 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
277 ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
278 ; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
279 ; AVX2-SLOW-NEXT: vzeroupper
280 ; AVX2-SLOW-NEXT: retq
282 ; AVX2-FAST-LABEL: trunc_add_v16i64_v16i8:
283 ; AVX2-FAST: # %bb.0:
284 ; AVX2-FAST-NEXT: vpaddq %ymm5, %ymm1, %ymm1
285 ; AVX2-FAST-NEXT: vpaddq %ymm4, %ymm0, %ymm0
286 ; AVX2-FAST-NEXT: vpaddq %ymm7, %ymm3, %ymm3
287 ; AVX2-FAST-NEXT: vpaddq %ymm6, %ymm2, %ymm2
288 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
289 ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2
290 ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3
291 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
292 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
293 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2
294 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
295 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
296 ; AVX2-FAST-NEXT: vpand %xmm5, %xmm2, %xmm2
297 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0
298 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1
299 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
300 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0
301 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
302 ; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0
303 ; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
304 ; AVX2-FAST-NEXT: vzeroupper
305 ; AVX2-FAST-NEXT: retq
307 ; AVX512-LABEL: trunc_add_v16i64_v16i8:
309 ; AVX512-NEXT: vpaddq %zmm3, %zmm1, %zmm1
310 ; AVX512-NEXT: vpaddq %zmm2, %zmm0, %zmm0
311 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
312 ; AVX512-NEXT: vpmovqd %zmm1, %ymm1
313 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
314 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
315 ; AVX512-NEXT: vzeroupper
317 %1 = add <16 x i64> %a0, %a1
318 %2 = trunc <16 x i64> %1 to <16 x i8>
322 define <16 x i8> @trunc_add_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
323 ; SSE-LABEL: trunc_add_v16i32_v16i8:
325 ; SSE-NEXT: paddd %xmm4, %xmm0
326 ; SSE-NEXT: paddd %xmm5, %xmm1
327 ; SSE-NEXT: paddd %xmm6, %xmm2
328 ; SSE-NEXT: paddd %xmm7, %xmm3
329 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
330 ; SSE-NEXT: pand %xmm4, %xmm3
331 ; SSE-NEXT: pand %xmm4, %xmm2
332 ; SSE-NEXT: packuswb %xmm3, %xmm2
333 ; SSE-NEXT: pand %xmm4, %xmm1
334 ; SSE-NEXT: pand %xmm4, %xmm0
335 ; SSE-NEXT: packuswb %xmm1, %xmm0
336 ; SSE-NEXT: packuswb %xmm2, %xmm0
339 ; AVX1-LABEL: trunc_add_v16i32_v16i8:
341 ; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm4
342 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
343 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
344 ; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
345 ; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm2
346 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
347 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
348 ; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1
349 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255]
350 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
351 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
352 ; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
353 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
354 ; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm2
355 ; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
356 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
357 ; AVX1-NEXT: vzeroupper
360 ; AVX2-LABEL: trunc_add_v16i32_v16i8:
362 ; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0
363 ; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1
364 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
365 ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
366 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
367 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
368 ; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1
369 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
370 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
371 ; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0
372 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
373 ; AVX2-NEXT: vzeroupper
376 ; AVX512-LABEL: trunc_add_v16i32_v16i8:
378 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
379 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
380 ; AVX512-NEXT: vzeroupper
382 %1 = add <16 x i32> %a0, %a1
383 %2 = trunc <16 x i32> %1 to <16 x i8>
387 define <16 x i8> @trunc_add_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
388 ; SSE-LABEL: trunc_add_v16i16_v16i8:
390 ; SSE-NEXT: paddw %xmm2, %xmm0
391 ; SSE-NEXT: paddw %xmm3, %xmm1
392 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
393 ; SSE-NEXT: pand %xmm2, %xmm1
394 ; SSE-NEXT: pand %xmm2, %xmm0
395 ; SSE-NEXT: packuswb %xmm1, %xmm0
398 ; AVX1-LABEL: trunc_add_v16i16_v16i8:
400 ; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm2
401 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
402 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
403 ; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
404 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
405 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
406 ; AVX1-NEXT: vpand %xmm1, %xmm2, %xmm1
407 ; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0
408 ; AVX1-NEXT: vzeroupper
411 ; AVX2-LABEL: trunc_add_v16i16_v16i8:
413 ; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
414 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
415 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
416 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
417 ; AVX2-NEXT: vzeroupper
420 ; AVX512F-LABEL: trunc_add_v16i16_v16i8:
422 ; AVX512F-NEXT: vpaddw %ymm1, %ymm0, %ymm0
423 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
424 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
425 ; AVX512F-NEXT: vzeroupper
428 ; AVX512BW-LABEL: trunc_add_v16i16_v16i8:
430 ; AVX512BW-NEXT: vpaddw %ymm1, %ymm0, %ymm0
431 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
432 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
433 ; AVX512BW-NEXT: vzeroupper
434 ; AVX512BW-NEXT: retq
436 ; AVX512DQ-LABEL: trunc_add_v16i16_v16i8:
438 ; AVX512DQ-NEXT: vpaddw %ymm1, %ymm0, %ymm0
439 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
440 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
441 ; AVX512DQ-NEXT: vzeroupper
442 ; AVX512DQ-NEXT: retq
443 %1 = add <16 x i16> %a0, %a1
444 %2 = trunc <16 x i16> %1 to <16 x i8>
448 define <8 x i16> @trunc_add_v8i32_v8i16_sext_8i8(<16 x i8> %a0, <8 x i32> %a1) {
449 ; SSE-LABEL: trunc_add_v8i32_v8i16_sext_8i8:
451 ; SSE-NEXT: pslld $16, %xmm2
452 ; SSE-NEXT: psrad $16, %xmm2
453 ; SSE-NEXT: pslld $16, %xmm1
454 ; SSE-NEXT: psrad $16, %xmm1
455 ; SSE-NEXT: packssdw %xmm2, %xmm1
456 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
457 ; SSE-NEXT: psraw $8, %xmm0
458 ; SSE-NEXT: paddw %xmm1, %xmm0
461 ; AVX1-LABEL: trunc_add_v8i32_v8i16_sext_8i8:
463 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
464 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
465 ; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
466 ; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
467 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
468 ; AVX1-NEXT: vpmovsxbw %xmm0, %xmm0
469 ; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
470 ; AVX1-NEXT: vzeroupper
473 ; AVX2-LABEL: trunc_add_v8i32_v8i16_sext_8i8:
475 ; AVX2-NEXT: vpmovsxbw %xmm0, %xmm0
476 ; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
477 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
478 ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
479 ; AVX2-NEXT: vzeroupper
482 ; AVX512-LABEL: trunc_add_v8i32_v8i16_sext_8i8:
484 ; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
485 ; AVX512-NEXT: vpmovdw %zmm1, %ymm1
486 ; AVX512-NEXT: vpmovsxbw %xmm0, %xmm0
487 ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
488 ; AVX512-NEXT: vzeroupper
490 %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
491 %2 = sext <8 x i8> %1 to <8 x i32>
492 %3 = add <8 x i32> %2, %a1
493 %4 = trunc <8 x i32> %3 to <8 x i16>
501 define <4 x i32> @trunc_add_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
502 ; SSE-LABEL: trunc_add_const_v4i64_v4i32:
504 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
505 ; SSE-NEXT: paddd {{.*}}(%rip), %xmm0
508 ; AVX1-LABEL: trunc_add_const_v4i64_v4i32:
510 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
511 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
512 ; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
513 ; AVX1-NEXT: vzeroupper
516 ; AVX2-SLOW-LABEL: trunc_add_const_v4i64_v4i32:
517 ; AVX2-SLOW: # %bb.0:
518 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
519 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
520 ; AVX2-SLOW-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
521 ; AVX2-SLOW-NEXT: vzeroupper
522 ; AVX2-SLOW-NEXT: retq
524 ; AVX2-FAST-LABEL: trunc_add_const_v4i64_v4i32:
525 ; AVX2-FAST: # %bb.0:
526 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
527 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
528 ; AVX2-FAST-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
529 ; AVX2-FAST-NEXT: vzeroupper
530 ; AVX2-FAST-NEXT: retq
532 ; AVX512-LABEL: trunc_add_const_v4i64_v4i32:
534 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
535 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
536 ; AVX512-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
537 ; AVX512-NEXT: vzeroupper
539 %1 = add <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
540 %2 = trunc <4 x i64> %1 to <4 x i32>
544 define <8 x i16> @trunc_add_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
545 ; SSE-LABEL: trunc_add_const_v8i64_v8i16:
547 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
548 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
549 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
550 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
551 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
552 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
553 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
554 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
555 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
556 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
557 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
558 ; SSE-NEXT: paddw {{.*}}(%rip), %xmm0
561 ; AVX1-LABEL: trunc_add_const_v8i64_v8i16:
563 ; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
564 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
565 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
566 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
567 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
568 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
569 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
570 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
571 ; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
572 ; AVX1-NEXT: vzeroupper
575 ; AVX2-SLOW-LABEL: trunc_add_const_v8i64_v8i16:
576 ; AVX2-SLOW: # %bb.0:
577 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
578 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
579 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
580 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
581 ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
582 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
583 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
584 ; AVX2-SLOW-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
585 ; AVX2-SLOW-NEXT: vzeroupper
586 ; AVX2-SLOW-NEXT: retq
588 ; AVX2-FAST-LABEL: trunc_add_const_v8i64_v8i16:
589 ; AVX2-FAST: # %bb.0:
590 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
591 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
592 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
593 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
594 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
595 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
596 ; AVX2-FAST-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
597 ; AVX2-FAST-NEXT: vzeroupper
598 ; AVX2-FAST-NEXT: retq
600 ; AVX512-LABEL: trunc_add_const_v8i64_v8i16:
602 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0
603 ; AVX512-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
604 ; AVX512-NEXT: vzeroupper
606 %1 = add <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
607 %2 = trunc <8 x i64> %1 to <8 x i16>
611 define <8 x i16> @trunc_add_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
612 ; SSE-LABEL: trunc_add_const_v8i32_v8i16:
614 ; SSE-NEXT: pslld $16, %xmm1
615 ; SSE-NEXT: psrad $16, %xmm1
616 ; SSE-NEXT: pslld $16, %xmm0
617 ; SSE-NEXT: psrad $16, %xmm0
618 ; SSE-NEXT: packssdw %xmm1, %xmm0
619 ; SSE-NEXT: paddw {{.*}}(%rip), %xmm0
622 ; AVX1-LABEL: trunc_add_const_v8i32_v8i16:
624 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
625 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
626 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
627 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
628 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
629 ; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
630 ; AVX1-NEXT: vzeroupper
633 ; AVX2-LABEL: trunc_add_const_v8i32_v8i16:
635 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
636 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
637 ; AVX2-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
638 ; AVX2-NEXT: vzeroupper
641 ; AVX512-LABEL: trunc_add_const_v8i32_v8i16:
643 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
644 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
645 ; AVX512-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
646 ; AVX512-NEXT: vzeroupper
648 %1 = add <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
649 %2 = trunc <8 x i32> %1 to <8 x i16>
653 define <16 x i8> @trunc_add_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
654 ; SSE-LABEL: trunc_add_const_v16i64_v16i8:
656 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
657 ; SSE-NEXT: pand %xmm8, %xmm7
658 ; SSE-NEXT: pand %xmm8, %xmm6
659 ; SSE-NEXT: packuswb %xmm7, %xmm6
660 ; SSE-NEXT: pand %xmm8, %xmm5
661 ; SSE-NEXT: pand %xmm8, %xmm4
662 ; SSE-NEXT: packuswb %xmm5, %xmm4
663 ; SSE-NEXT: packuswb %xmm6, %xmm4
664 ; SSE-NEXT: pand %xmm8, %xmm3
665 ; SSE-NEXT: pand %xmm8, %xmm2
666 ; SSE-NEXT: packuswb %xmm3, %xmm2
667 ; SSE-NEXT: pand %xmm8, %xmm1
668 ; SSE-NEXT: pand %xmm8, %xmm0
669 ; SSE-NEXT: packuswb %xmm1, %xmm0
670 ; SSE-NEXT: packuswb %xmm2, %xmm0
671 ; SSE-NEXT: packuswb %xmm4, %xmm0
672 ; SSE-NEXT: paddb {{.*}}(%rip), %xmm0
675 ; AVX1-LABEL: trunc_add_const_v16i64_v16i8:
677 ; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255]
678 ; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
679 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
680 ; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
681 ; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
682 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
683 ; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2
684 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
685 ; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
686 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
687 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
688 ; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
689 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
690 ; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
691 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
692 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
693 ; AVX1-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
694 ; AVX1-NEXT: vzeroupper
697 ; AVX2-SLOW-LABEL: trunc_add_const_v16i64_v16i8:
698 ; AVX2-SLOW: # %bb.0:
699 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4
700 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
701 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4
702 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
703 ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
704 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
705 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
706 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
707 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
708 ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
709 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5
710 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
711 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5
712 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
713 ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
714 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
715 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
716 ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
717 ; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
718 ; AVX2-SLOW-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
719 ; AVX2-SLOW-NEXT: vzeroupper
720 ; AVX2-SLOW-NEXT: retq
722 ; AVX2-FAST-LABEL: trunc_add_const_v16i64_v16i8:
723 ; AVX2-FAST: # %bb.0:
724 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
725 ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2
726 ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3
727 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
728 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
729 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2
730 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
731 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
732 ; AVX2-FAST-NEXT: vpand %xmm5, %xmm2, %xmm2
733 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0
734 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1
735 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
736 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0
737 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
738 ; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0
739 ; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
740 ; AVX2-FAST-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
741 ; AVX2-FAST-NEXT: vzeroupper
742 ; AVX2-FAST-NEXT: retq
744 ; AVX512-LABEL: trunc_add_const_v16i64_v16i8:
746 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
747 ; AVX512-NEXT: vpmovqd %zmm1, %ymm1
748 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
749 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
750 ; AVX512-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
751 ; AVX512-NEXT: vzeroupper
753 %1 = add <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
754 %2 = trunc <16 x i64> %1 to <16 x i8>
758 define <16 x i8> @trunc_add_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
759 ; SSE-LABEL: trunc_add_const_v16i32_v16i8:
761 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
762 ; SSE-NEXT: pand %xmm4, %xmm3
763 ; SSE-NEXT: pand %xmm4, %xmm2
764 ; SSE-NEXT: packuswb %xmm3, %xmm2
765 ; SSE-NEXT: pand %xmm4, %xmm1
766 ; SSE-NEXT: pand %xmm4, %xmm0
767 ; SSE-NEXT: packuswb %xmm1, %xmm0
768 ; SSE-NEXT: packuswb %xmm2, %xmm0
769 ; SSE-NEXT: paddb {{.*}}(%rip), %xmm0
772 ; AVX1-LABEL: trunc_add_const_v16i32_v16i8:
774 ; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
775 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
776 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
777 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
778 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
779 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
780 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
781 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
782 ; AVX1-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
783 ; AVX1-NEXT: vzeroupper
786 ; AVX2-LABEL: trunc_add_const_v16i32_v16i8:
788 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
789 ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
790 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
791 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
792 ; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1
793 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
794 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
795 ; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0
796 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
797 ; AVX2-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
798 ; AVX2-NEXT: vzeroupper
801 ; AVX512-LABEL: trunc_add_const_v16i32_v16i8:
803 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
804 ; AVX512-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
805 ; AVX512-NEXT: vzeroupper
807 %1 = add <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
808 %2 = trunc <16 x i32> %1 to <16 x i8>
812 define <16 x i8> @trunc_add_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
813 ; SSE-LABEL: trunc_add_const_v16i16_v16i8:
815 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
816 ; SSE-NEXT: pand %xmm2, %xmm1
817 ; SSE-NEXT: pand %xmm2, %xmm0
818 ; SSE-NEXT: packuswb %xmm1, %xmm0
819 ; SSE-NEXT: paddb {{.*}}(%rip), %xmm0
822 ; AVX1-LABEL: trunc_add_const_v16i16_v16i8:
824 ; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
825 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
826 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
827 ; AVX1-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
828 ; AVX1-NEXT: vzeroupper
831 ; AVX2-LABEL: trunc_add_const_v16i16_v16i8:
833 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
834 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
835 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
836 ; AVX2-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
837 ; AVX2-NEXT: vzeroupper
840 ; AVX512F-LABEL: trunc_add_const_v16i16_v16i8:
842 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
843 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
844 ; AVX512F-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
845 ; AVX512F-NEXT: vzeroupper
848 ; AVX512BW-LABEL: trunc_add_const_v16i16_v16i8:
850 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
851 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
852 ; AVX512BW-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
853 ; AVX512BW-NEXT: vzeroupper
854 ; AVX512BW-NEXT: retq
856 ; AVX512DQ-LABEL: trunc_add_const_v16i16_v16i8:
858 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
859 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
860 ; AVX512DQ-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
861 ; AVX512DQ-NEXT: vzeroupper
862 ; AVX512DQ-NEXT: retq
863 %1 = add <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
864 %2 = trunc <16 x i16> %1 to <16 x i8>
872 define <4 x i32> @trunc_sub_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
873 ; SSE-LABEL: trunc_sub_v4i64_v4i32:
875 ; SSE-NEXT: psubq %xmm3, %xmm1
876 ; SSE-NEXT: psubq %xmm2, %xmm0
877 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
880 ; AVX1-LABEL: trunc_sub_v4i64_v4i32:
882 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
883 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
884 ; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2
885 ; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0
886 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
887 ; AVX1-NEXT: vzeroupper
890 ; AVX2-SLOW-LABEL: trunc_sub_v4i64_v4i32:
891 ; AVX2-SLOW: # %bb.0:
892 ; AVX2-SLOW-NEXT: vpsubq %ymm1, %ymm0, %ymm0
893 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
894 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
895 ; AVX2-SLOW-NEXT: vzeroupper
896 ; AVX2-SLOW-NEXT: retq
898 ; AVX2-FAST-LABEL: trunc_sub_v4i64_v4i32:
899 ; AVX2-FAST: # %bb.0:
900 ; AVX2-FAST-NEXT: vpsubq %ymm1, %ymm0, %ymm0
901 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
902 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
903 ; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
904 ; AVX2-FAST-NEXT: vzeroupper
905 ; AVX2-FAST-NEXT: retq
907 ; AVX512-LABEL: trunc_sub_v4i64_v4i32:
909 ; AVX512-NEXT: vpsubq %ymm1, %ymm0, %ymm0
910 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
911 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
912 ; AVX512-NEXT: vzeroupper
914 %1 = sub <4 x i64> %a0, %a1
915 %2 = trunc <4 x i64> %1 to <4 x i32>
919 define <8 x i16> @trunc_sub_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
920 ; SSE-LABEL: trunc_sub_v8i64_v8i16:
922 ; SSE-NEXT: psubq %xmm6, %xmm2
923 ; SSE-NEXT: psubq %xmm7, %xmm3
924 ; SSE-NEXT: psubq %xmm4, %xmm0
925 ; SSE-NEXT: psubq %xmm5, %xmm1
926 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
927 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
928 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
929 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
930 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
931 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
932 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
933 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
934 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
935 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
936 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
939 ; AVX1-LABEL: trunc_sub_v8i64_v8i16:
941 ; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm4
942 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
943 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
944 ; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0
945 ; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm2
946 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
947 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
948 ; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm1
949 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
950 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
951 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
952 ; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
953 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
954 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1,2,3],xmm4[4],xmm3[5,6,7]
955 ; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
956 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
957 ; AVX1-NEXT: vzeroupper
960 ; AVX2-SLOW-LABEL: trunc_sub_v8i64_v8i16:
961 ; AVX2-SLOW: # %bb.0:
962 ; AVX2-SLOW-NEXT: vpsubq %ymm2, %ymm0, %ymm0
963 ; AVX2-SLOW-NEXT: vpsubq %ymm3, %ymm1, %ymm1
964 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2
965 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
966 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2
967 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
968 ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
969 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
970 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
971 ; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
972 ; AVX2-SLOW-NEXT: vzeroupper
973 ; AVX2-SLOW-NEXT: retq
975 ; AVX2-FAST-LABEL: trunc_sub_v8i64_v8i16:
976 ; AVX2-FAST: # %bb.0:
977 ; AVX2-FAST-NEXT: vpsubq %ymm3, %ymm1, %ymm1
978 ; AVX2-FAST-NEXT: vpsubq %ymm2, %ymm0, %ymm0
979 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
980 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
981 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
982 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
983 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
984 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
985 ; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
986 ; AVX2-FAST-NEXT: vzeroupper
987 ; AVX2-FAST-NEXT: retq
989 ; AVX512-LABEL: trunc_sub_v8i64_v8i16:
991 ; AVX512-NEXT: vpsubq %zmm1, %zmm0, %zmm0
992 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0
993 ; AVX512-NEXT: vzeroupper
995 %1 = sub <8 x i64> %a0, %a1
996 %2 = trunc <8 x i64> %1 to <8 x i16>
1000 define <8 x i16> @trunc_sub_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
1001 ; SSE-LABEL: trunc_sub_v8i32_v8i16:
1003 ; SSE-NEXT: psubd %xmm2, %xmm0
1004 ; SSE-NEXT: psubd %xmm3, %xmm1
1005 ; SSE-NEXT: pslld $16, %xmm1
1006 ; SSE-NEXT: psrad $16, %xmm1
1007 ; SSE-NEXT: pslld $16, %xmm0
1008 ; SSE-NEXT: psrad $16, %xmm0
1009 ; SSE-NEXT: packssdw %xmm1, %xmm0
1012 ; AVX1-LABEL: trunc_sub_v8i32_v8i16:
1014 ; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm2
1015 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
1016 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1017 ; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
1018 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
1019 ; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
1020 ; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1
1021 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1022 ; AVX1-NEXT: vzeroupper
1025 ; AVX2-LABEL: trunc_sub_v8i32_v8i16:
1027 ; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0
1028 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
1029 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1030 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1031 ; AVX2-NEXT: vzeroupper
1034 ; AVX512-LABEL: trunc_sub_v8i32_v8i16:
1036 ; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm0
1037 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
1038 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1039 ; AVX512-NEXT: vzeroupper
1041 %1 = sub <8 x i32> %a0, %a1
1042 %2 = trunc <8 x i32> %1 to <8 x i16>
1046 define <16 x i8> @trunc_sub_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
1047 ; SSE-LABEL: trunc_sub_v16i64_v16i8:
1049 ; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm0
1050 ; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm1
1051 ; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm2
1052 ; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm3
1053 ; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm4
1054 ; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm5
1055 ; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm6
1056 ; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm7
1057 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
1058 ; SSE-NEXT: pand %xmm8, %xmm7
1059 ; SSE-NEXT: pand %xmm8, %xmm6
1060 ; SSE-NEXT: packuswb %xmm7, %xmm6
1061 ; SSE-NEXT: pand %xmm8, %xmm5
1062 ; SSE-NEXT: pand %xmm8, %xmm4
1063 ; SSE-NEXT: packuswb %xmm5, %xmm4
1064 ; SSE-NEXT: packuswb %xmm6, %xmm4
1065 ; SSE-NEXT: pand %xmm8, %xmm3
1066 ; SSE-NEXT: pand %xmm8, %xmm2
1067 ; SSE-NEXT: packuswb %xmm3, %xmm2
1068 ; SSE-NEXT: pand %xmm8, %xmm1
1069 ; SSE-NEXT: pand %xmm8, %xmm0
1070 ; SSE-NEXT: packuswb %xmm1, %xmm0
1071 ; SSE-NEXT: packuswb %xmm2, %xmm0
1072 ; SSE-NEXT: packuswb %xmm4, %xmm0
1075 ; AVX1-LABEL: trunc_sub_v16i64_v16i8:
1077 ; AVX1-NEXT: vpsubq %xmm4, %xmm0, %xmm8
1078 ; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm4
1079 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1080 ; AVX1-NEXT: vpsubq %xmm4, %xmm0, %xmm0
1081 ; AVX1-NEXT: vpsubq %xmm5, %xmm1, %xmm4
1082 ; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm5
1083 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
1084 ; AVX1-NEXT: vpsubq %xmm5, %xmm1, %xmm1
1085 ; AVX1-NEXT: vpsubq %xmm6, %xmm2, %xmm5
1086 ; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm6
1087 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
1088 ; AVX1-NEXT: vpsubq %xmm6, %xmm2, %xmm2
1089 ; AVX1-NEXT: vpsubq %xmm7, %xmm3, %xmm6
1090 ; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm7
1091 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
1092 ; AVX1-NEXT: vpsubq %xmm7, %xmm3, %xmm3
1093 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [255,255]
1094 ; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
1095 ; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6
1096 ; AVX1-NEXT: vpackusdw %xmm3, %xmm6, %xmm3
1097 ; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2
1098 ; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5
1099 ; AVX1-NEXT: vpackusdw %xmm2, %xmm5, %xmm2
1100 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
1101 ; AVX1-NEXT: vpand %xmm7, %xmm1, %xmm1
1102 ; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm3
1103 ; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1
1104 ; AVX1-NEXT: vpand %xmm7, %xmm0, %xmm0
1105 ; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm3
1106 ; AVX1-NEXT: vpackusdw %xmm0, %xmm3, %xmm0
1107 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
1108 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
1109 ; AVX1-NEXT: vzeroupper
1112 ; AVX2-SLOW-LABEL: trunc_sub_v16i64_v16i8:
1113 ; AVX2-SLOW: # %bb.0:
1114 ; AVX2-SLOW-NEXT: vpsubq %ymm4, %ymm0, %ymm0
1115 ; AVX2-SLOW-NEXT: vpsubq %ymm5, %ymm1, %ymm1
1116 ; AVX2-SLOW-NEXT: vpsubq %ymm6, %ymm2, %ymm2
1117 ; AVX2-SLOW-NEXT: vpsubq %ymm7, %ymm3, %ymm3
1118 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4
1119 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
1120 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm4
1121 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
1122 ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
1123 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
1124 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
1125 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
1126 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
1127 ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
1128 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm5
1129 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
1130 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm5
1131 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
1132 ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1133 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
1134 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1135 ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
1136 ; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
1137 ; AVX2-SLOW-NEXT: vzeroupper
1138 ; AVX2-SLOW-NEXT: retq
1140 ; AVX2-FAST-LABEL: trunc_sub_v16i64_v16i8:
1141 ; AVX2-FAST: # %bb.0:
1142 ; AVX2-FAST-NEXT: vpsubq %ymm5, %ymm1, %ymm1
1143 ; AVX2-FAST-NEXT: vpsubq %ymm4, %ymm0, %ymm0
1144 ; AVX2-FAST-NEXT: vpsubq %ymm7, %ymm3, %ymm3
1145 ; AVX2-FAST-NEXT: vpsubq %ymm6, %ymm2, %ymm2
1146 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
1147 ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2
1148 ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3
1149 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
1150 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
1151 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2
1152 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
1153 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
1154 ; AVX2-FAST-NEXT: vpand %xmm5, %xmm2, %xmm2
1155 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0
1156 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1
1157 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
1158 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0
1159 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1160 ; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0
1161 ; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
1162 ; AVX2-FAST-NEXT: vzeroupper
1163 ; AVX2-FAST-NEXT: retq
1165 ; AVX512-LABEL: trunc_sub_v16i64_v16i8:
1167 ; AVX512-NEXT: vpsubq %zmm3, %zmm1, %zmm1
1168 ; AVX512-NEXT: vpsubq %zmm2, %zmm0, %zmm0
1169 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
1170 ; AVX512-NEXT: vpmovqd %zmm1, %ymm1
1171 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
1172 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
1173 ; AVX512-NEXT: vzeroupper
1175 %1 = sub <16 x i64> %a0, %a1
1176 %2 = trunc <16 x i64> %1 to <16 x i8>
1180 define <16 x i8> @trunc_sub_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
1181 ; SSE-LABEL: trunc_sub_v16i32_v16i8:
1183 ; SSE-NEXT: psubd %xmm4, %xmm0
1184 ; SSE-NEXT: psubd %xmm5, %xmm1
1185 ; SSE-NEXT: psubd %xmm6, %xmm2
1186 ; SSE-NEXT: psubd %xmm7, %xmm3
1187 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
1188 ; SSE-NEXT: pand %xmm4, %xmm3
1189 ; SSE-NEXT: pand %xmm4, %xmm2
1190 ; SSE-NEXT: packuswb %xmm3, %xmm2
1191 ; SSE-NEXT: pand %xmm4, %xmm1
1192 ; SSE-NEXT: pand %xmm4, %xmm0
1193 ; SSE-NEXT: packuswb %xmm1, %xmm0
1194 ; SSE-NEXT: packuswb %xmm2, %xmm0
1197 ; AVX1-LABEL: trunc_sub_v16i32_v16i8:
1199 ; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm4
1200 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
1201 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1202 ; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm0
1203 ; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm2
1204 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
1205 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
1206 ; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm1
1207 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255]
1208 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
1209 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
1210 ; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
1211 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
1212 ; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm2
1213 ; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
1214 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
1215 ; AVX1-NEXT: vzeroupper
1218 ; AVX2-LABEL: trunc_sub_v16i32_v16i8:
1220 ; AVX2-NEXT: vpsubd %ymm2, %ymm0, %ymm0
1221 ; AVX2-NEXT: vpsubd %ymm3, %ymm1, %ymm1
1222 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
1223 ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
1224 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
1225 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
1226 ; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1
1227 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
1228 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1229 ; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0
1230 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
1231 ; AVX2-NEXT: vzeroupper
1234 ; AVX512-LABEL: trunc_sub_v16i32_v16i8:
1236 ; AVX512-NEXT: vpsubd %zmm1, %zmm0, %zmm0
1237 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
1238 ; AVX512-NEXT: vzeroupper
1240 %1 = sub <16 x i32> %a0, %a1
1241 %2 = trunc <16 x i32> %1 to <16 x i8>
1245 define <16 x i8> @trunc_sub_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
1246 ; SSE-LABEL: trunc_sub_v16i16_v16i8:
1248 ; SSE-NEXT: psubw %xmm2, %xmm0
1249 ; SSE-NEXT: psubw %xmm3, %xmm1
1250 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
1251 ; SSE-NEXT: pand %xmm2, %xmm1
1252 ; SSE-NEXT: pand %xmm2, %xmm0
1253 ; SSE-NEXT: packuswb %xmm1, %xmm0
1256 ; AVX1-LABEL: trunc_sub_v16i16_v16i8:
1258 ; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm2
1259 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
1260 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1261 ; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm0
1262 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
1263 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
1264 ; AVX1-NEXT: vpand %xmm1, %xmm2, %xmm1
1265 ; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0
1266 ; AVX1-NEXT: vzeroupper
1269 ; AVX2-LABEL: trunc_sub_v16i16_v16i8:
1271 ; AVX2-NEXT: vpsubw %ymm1, %ymm0, %ymm0
1272 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
1273 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1274 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
1275 ; AVX2-NEXT: vzeroupper
1278 ; AVX512F-LABEL: trunc_sub_v16i16_v16i8:
1280 ; AVX512F-NEXT: vpsubw %ymm1, %ymm0, %ymm0
1281 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1282 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
1283 ; AVX512F-NEXT: vzeroupper
1284 ; AVX512F-NEXT: retq
1286 ; AVX512BW-LABEL: trunc_sub_v16i16_v16i8:
1287 ; AVX512BW: # %bb.0:
1288 ; AVX512BW-NEXT: vpsubw %ymm1, %ymm0, %ymm0
1289 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
1290 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1291 ; AVX512BW-NEXT: vzeroupper
1292 ; AVX512BW-NEXT: retq
1294 ; AVX512DQ-LABEL: trunc_sub_v16i16_v16i8:
1295 ; AVX512DQ: # %bb.0:
1296 ; AVX512DQ-NEXT: vpsubw %ymm1, %ymm0, %ymm0
1297 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1298 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
1299 ; AVX512DQ-NEXT: vzeroupper
1300 ; AVX512DQ-NEXT: retq
1301 %1 = sub <16 x i16> %a0, %a1
1302 %2 = trunc <16 x i16> %1 to <16 x i8>
1306 define <16 x i8> @trunc_ext_sub_v16i16_v16i8(<16 x i8> %x, <16 x i8> %y) {
1307 ; SSE-LABEL: trunc_ext_sub_v16i16_v16i8:
1309 ; SSE-NEXT: psubb %xmm1, %xmm0
1312 ; AVX-LABEL: trunc_ext_sub_v16i16_v16i8:
1314 ; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0
1316 %a = zext <16 x i8> %x to <16 x i16>
1317 %b = zext <16 x i8> %y to <16 x i16>
1318 %c = sub <16 x i16> %a, %b
1319 %d = trunc <16 x i16> %c to <16 x i8>
1327 define <4 x i32> @trunc_sub_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
1328 ; SSE-LABEL: trunc_sub_const_v4i64_v4i32:
1330 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
1331 ; SSE-NEXT: psubd {{.*}}(%rip), %xmm0
1334 ; AVX1-LABEL: trunc_sub_const_v4i64_v4i32:
1336 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1337 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
1338 ; AVX1-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0
1339 ; AVX1-NEXT: vzeroupper
1342 ; AVX2-SLOW-LABEL: trunc_sub_const_v4i64_v4i32:
1343 ; AVX2-SLOW: # %bb.0:
1344 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
1345 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
1346 ; AVX2-SLOW-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0
1347 ; AVX2-SLOW-NEXT: vzeroupper
1348 ; AVX2-SLOW-NEXT: retq
1350 ; AVX2-FAST-LABEL: trunc_sub_const_v4i64_v4i32:
1351 ; AVX2-FAST: # %bb.0:
1352 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
1353 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
1354 ; AVX2-FAST-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0
1355 ; AVX2-FAST-NEXT: vzeroupper
1356 ; AVX2-FAST-NEXT: retq
1358 ; AVX512-LABEL: trunc_sub_const_v4i64_v4i32:
1360 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1361 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
1362 ; AVX512-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0
1363 ; AVX512-NEXT: vzeroupper
1365 %1 = sub <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
1366 %2 = trunc <4 x i64> %1 to <4 x i32>
1370 define <8 x i16> @trunc_sub_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
1371 ; SSE-LABEL: trunc_sub_const_v8i64_v8i16:
1373 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1374 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
1375 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1376 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
1377 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
1378 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
1379 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
1380 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
1381 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
1382 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1383 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
1384 ; SSE-NEXT: psubw {{.*}}(%rip), %xmm0
1387 ; AVX1-LABEL: trunc_sub_const_v8i64_v8i16:
1389 ; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
1390 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
1391 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
1392 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
1393 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
1394 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
1395 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
1396 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
1397 ; AVX1-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
1398 ; AVX1-NEXT: vzeroupper
1401 ; AVX2-SLOW-LABEL: trunc_sub_const_v8i64_v8i16:
1402 ; AVX2-SLOW: # %bb.0:
1403 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
1404 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
1405 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
1406 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
1407 ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1408 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
1409 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1410 ; AVX2-SLOW-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
1411 ; AVX2-SLOW-NEXT: vzeroupper
1412 ; AVX2-SLOW-NEXT: retq
1414 ; AVX2-FAST-LABEL: trunc_sub_const_v8i64_v8i16:
1415 ; AVX2-FAST: # %bb.0:
1416 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
1417 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
1418 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
1419 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
1420 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
1421 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1422 ; AVX2-FAST-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
1423 ; AVX2-FAST-NEXT: vzeroupper
1424 ; AVX2-FAST-NEXT: retq
1426 ; AVX512-LABEL: trunc_sub_const_v8i64_v8i16:
1428 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0
1429 ; AVX512-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
1430 ; AVX512-NEXT: vzeroupper
1432 %1 = sub <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
1433 %2 = trunc <8 x i64> %1 to <8 x i16>
1437 define <8 x i16> @trunc_sub_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
1438 ; SSE-LABEL: trunc_sub_const_v8i32_v8i16:
1440 ; SSE-NEXT: pslld $16, %xmm1
1441 ; SSE-NEXT: psrad $16, %xmm1
1442 ; SSE-NEXT: pslld $16, %xmm0
1443 ; SSE-NEXT: psrad $16, %xmm0
1444 ; SSE-NEXT: packssdw %xmm1, %xmm0
1445 ; SSE-NEXT: psubw {{.*}}(%rip), %xmm0
1448 ; AVX1-LABEL: trunc_sub_const_v8i32_v8i16:
1450 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1451 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
1452 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
1453 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
1454 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1455 ; AVX1-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
1456 ; AVX1-NEXT: vzeroupper
1459 ; AVX2-LABEL: trunc_sub_const_v8i32_v8i16:
1461 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
1462 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1463 ; AVX2-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
1464 ; AVX2-NEXT: vzeroupper
1467 ; AVX512-LABEL: trunc_sub_const_v8i32_v8i16:
1469 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1470 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
1471 ; AVX512-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
1472 ; AVX512-NEXT: vzeroupper
1474 %1 = sub <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1475 %2 = trunc <8 x i32> %1 to <8 x i16>
1476 ret <8 x i16> %2
1477 }
1479 define <16 x i8> @trunc_sub_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
1480 ; SSE-LABEL: trunc_sub_const_v16i64_v16i8:
1482 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
1483 ; SSE-NEXT: pand %xmm8, %xmm7
1484 ; SSE-NEXT: pand %xmm8, %xmm6
1485 ; SSE-NEXT: packuswb %xmm7, %xmm6
1486 ; SSE-NEXT: pand %xmm8, %xmm5
1487 ; SSE-NEXT: pand %xmm8, %xmm4
1488 ; SSE-NEXT: packuswb %xmm5, %xmm4
1489 ; SSE-NEXT: packuswb %xmm6, %xmm4
1490 ; SSE-NEXT: pand %xmm8, %xmm3
1491 ; SSE-NEXT: pand %xmm8, %xmm2
1492 ; SSE-NEXT: packuswb %xmm3, %xmm2
1493 ; SSE-NEXT: pand %xmm8, %xmm1
1494 ; SSE-NEXT: pand %xmm8, %xmm0
1495 ; SSE-NEXT: packuswb %xmm1, %xmm0
1496 ; SSE-NEXT: packuswb %xmm2, %xmm0
1497 ; SSE-NEXT: packuswb %xmm4, %xmm0
1498 ; SSE-NEXT: psubb {{.*}}(%rip), %xmm0
1501 ; AVX1-LABEL: trunc_sub_const_v16i64_v16i8:
1503 ; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255]
1504 ; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
1505 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
1506 ; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
1507 ; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
1508 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
1509 ; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2
1510 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
1511 ; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
1512 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
1513 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
1514 ; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
1515 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
1516 ; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
1517 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
1518 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
1519 ; AVX1-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
1520 ; AVX1-NEXT: vzeroupper
1523 ; AVX2-SLOW-LABEL: trunc_sub_const_v16i64_v16i8:
1524 ; AVX2-SLOW: # %bb.0:
1525 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4
1526 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
1527 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4
1528 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
1529 ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
1530 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
1531 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
1532 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
1533 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
1534 ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
1535 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5
1536 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
1537 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5
1538 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
1539 ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1540 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
1541 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1542 ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
1543 ; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
1544 ; AVX2-SLOW-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
1545 ; AVX2-SLOW-NEXT: vzeroupper
1546 ; AVX2-SLOW-NEXT: retq
1548 ; AVX2-FAST-LABEL: trunc_sub_const_v16i64_v16i8:
1549 ; AVX2-FAST: # %bb.0:
1550 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
1551 ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2
1552 ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3
1553 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
1554 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
1555 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2
1556 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
1557 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
1558 ; AVX2-FAST-NEXT: vpand %xmm5, %xmm2, %xmm2
1559 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0
1560 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1
1561 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
1562 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0
1563 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1564 ; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0
1565 ; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
1566 ; AVX2-FAST-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
1567 ; AVX2-FAST-NEXT: vzeroupper
1568 ; AVX2-FAST-NEXT: retq
1570 ; AVX512-LABEL: trunc_sub_const_v16i64_v16i8:
1572 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
1573 ; AVX512-NEXT: vpmovqd %zmm1, %ymm1
1574 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
1575 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
1576 ; AVX512-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
1577 ; AVX512-NEXT: vzeroupper
1579 %1 = sub <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
1580 %2 = trunc <16 x i64> %1 to <16 x i8>
1581 ret <16 x i8> %2
1582 }
1584 define <16 x i8> @trunc_sub_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
1585 ; SSE-LABEL: trunc_sub_const_v16i32_v16i8:
1587 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
1588 ; SSE-NEXT: pand %xmm4, %xmm3
1589 ; SSE-NEXT: pand %xmm4, %xmm2
1590 ; SSE-NEXT: packuswb %xmm3, %xmm2
1591 ; SSE-NEXT: pand %xmm4, %xmm1
1592 ; SSE-NEXT: pand %xmm4, %xmm0
1593 ; SSE-NEXT: packuswb %xmm1, %xmm0
1594 ; SSE-NEXT: packuswb %xmm2, %xmm0
1595 ; SSE-NEXT: psubb {{.*}}(%rip), %xmm0
1598 ; AVX1-LABEL: trunc_sub_const_v16i32_v16i8:
1600 ; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
1601 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
1602 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
1603 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
1604 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
1605 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
1606 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
1607 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
1608 ; AVX1-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
1609 ; AVX1-NEXT: vzeroupper
1612 ; AVX2-LABEL: trunc_sub_const_v16i32_v16i8:
1614 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
1615 ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
1616 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
1617 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
1618 ; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1
1619 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
1620 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1621 ; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0
1622 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
1623 ; AVX2-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
1624 ; AVX2-NEXT: vzeroupper
1627 ; AVX512-LABEL: trunc_sub_const_v16i32_v16i8:
1629 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
1630 ; AVX512-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
1631 ; AVX512-NEXT: vzeroupper
1633 %1 = sub <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1634 %2 = trunc <16 x i32> %1 to <16 x i8>
1635 ret <16 x i8> %2
1636 }
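; NOTE: for the v16i16 -> v16i8 truncates below, runs without AVX512BW mask the low bytes and
; packuswb (or zero-extend to v16i32 and use vpmovdb on AVX512F/AVX512DQ); only the AVX512BW
; run is expected to use vpmovwb directly.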
1638 define <16 x i8> @trunc_sub_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
1639 ; SSE-LABEL: trunc_sub_const_v16i16_v16i8:
1641 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
1642 ; SSE-NEXT: pand %xmm2, %xmm1
1643 ; SSE-NEXT: pand %xmm2, %xmm0
1644 ; SSE-NEXT: packuswb %xmm1, %xmm0
1645 ; SSE-NEXT: psubb {{.*}}(%rip), %xmm0
1648 ; AVX1-LABEL: trunc_sub_const_v16i16_v16i8:
1650 ; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
1651 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1652 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
1653 ; AVX1-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
1654 ; AVX1-NEXT: vzeroupper
1657 ; AVX2-LABEL: trunc_sub_const_v16i16_v16i8:
1659 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
1660 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1661 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
1662 ; AVX2-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
1663 ; AVX2-NEXT: vzeroupper
1666 ; AVX512F-LABEL: trunc_sub_const_v16i16_v16i8:
1668 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1669 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
1670 ; AVX512F-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
1671 ; AVX512F-NEXT: vzeroupper
1672 ; AVX512F-NEXT: retq
1674 ; AVX512BW-LABEL: trunc_sub_const_v16i16_v16i8:
1675 ; AVX512BW: # %bb.0:
1676 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1677 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
1678 ; AVX512BW-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
1679 ; AVX512BW-NEXT: vzeroupper
1680 ; AVX512BW-NEXT: retq
1682 ; AVX512DQ-LABEL: trunc_sub_const_v16i16_v16i8:
1683 ; AVX512DQ: # %bb.0:
1684 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1685 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
1686 ; AVX512DQ-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
1687 ; AVX512DQ-NEXT: vzeroupper
1688 ; AVX512DQ-NEXT: retq
1689 %1 = sub <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
1690 %2 = trunc <16 x i16> %1 to <16 x i8>
1691 ret <16 x i8> %2
1692 }
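; NOTE: the two ext/sub/trunc tests that follow use a constant operand; with the constant on
; the RHS the zext+sub+trunc chain should narrow all the way to a single byte subtract from
; the constant pool, while the LHS form still needs the constant vector materialized first.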
1694 define <16 x i8> @trunc_ext_sub_const_rhs_v16i16_v16i8(<16 x i8> %x) {
1695 ; SSE-LABEL: trunc_ext_sub_const_rhs_v16i16_v16i8:
1697 ; SSE-NEXT: psubb {{.*}}(%rip), %xmm0
1700 ; AVX-LABEL: trunc_ext_sub_const_rhs_v16i16_v16i8:
1702 ; AVX-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
1704 %a = zext <16 x i8> %x to <16 x i16>
1705 %b = sub <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
1706 %c = trunc <16 x i16> %b to <16 x i8>
1707 ret <16 x i8> %c
1708 }
1710 define <16 x i8> @trunc_ext_sub_const_lhs_v16i16_v16i8(<16 x i8> %x) {
1711 ; SSE-LABEL: trunc_ext_sub_const_lhs_v16i16_v16i8:
1713 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
1714 ; SSE-NEXT: psubb %xmm0, %xmm1
1715 ; SSE-NEXT: movdqa %xmm1, %xmm0
1718 ; AVX-LABEL: trunc_ext_sub_const_lhs_v16i16_v16i8:
1720 ; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
1721 ; AVX-NEXT: vpsubb %xmm0, %xmm1, %xmm0
1723 %a = zext <16 x i8> %x to <16 x i16>
1724 %b = sub <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, %a
1725 %c = trunc <16 x i16> %b to <16 x i8>
1726 ret <16 x i8> %c
1727 }
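; NOTE: the multiply tests below cover the same truncation patterns; runs without a native
; 64-bit multiply are expected to truncate first and use a narrower multiply (e.g. vpmovqd +
; vpmulld), while the AVX512DQ runs can use vpmullq before truncating.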
1733 define <4 x i32> @trunc_mul_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
1734 ; SSE-LABEL: trunc_mul_v4i64_v4i32:
1736 ; SSE-NEXT: pmuludq %xmm3, %xmm1
1737 ; SSE-NEXT: pmuludq %xmm2, %xmm0
1738 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
1741 ; AVX1-LABEL: trunc_mul_v4i64_v4i32:
1743 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
1744 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
1745 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
1746 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
1747 ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
1748 ; AVX1-NEXT: vzeroupper
1751 ; AVX2-SLOW-LABEL: trunc_mul_v4i64_v4i32:
1752 ; AVX2-SLOW: # %bb.0:
1753 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
1754 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
1755 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
1756 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
1757 ; AVX2-SLOW-NEXT: vpmulld %xmm1, %xmm0, %xmm0
1758 ; AVX2-SLOW-NEXT: vzeroupper
1759 ; AVX2-SLOW-NEXT: retq
1761 ; AVX2-FAST-LABEL: trunc_mul_v4i64_v4i32:
1762 ; AVX2-FAST: # %bb.0:
1763 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
1764 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
1765 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
1766 ; AVX2-FAST-NEXT: vpmulld %xmm1, %xmm0, %xmm0
1767 ; AVX2-FAST-NEXT: vzeroupper
1768 ; AVX2-FAST-NEXT: retq
1770 ; AVX512F-LABEL: trunc_mul_v4i64_v4i32:
1772 ; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
1773 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1774 ; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
1775 ; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
1776 ; AVX512F-NEXT: vpmulld %xmm1, %xmm0, %xmm0
1777 ; AVX512F-NEXT: vzeroupper
1778 ; AVX512F-NEXT: retq
1780 ; AVX512BW-LABEL: trunc_mul_v4i64_v4i32:
1781 ; AVX512BW: # %bb.0:
1782 ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
1783 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1784 ; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
1785 ; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
1786 ; AVX512BW-NEXT: vpmulld %xmm1, %xmm0, %xmm0
1787 ; AVX512BW-NEXT: vzeroupper
1788 ; AVX512BW-NEXT: retq
1790 ; AVX512DQ-LABEL: trunc_mul_v4i64_v4i32:
1791 ; AVX512DQ: # %bb.0:
1792 ; AVX512DQ-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
1793 ; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1794 ; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
1795 ; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0
1796 ; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1797 ; AVX512DQ-NEXT: vzeroupper
1798 ; AVX512DQ-NEXT: retq
1799 %1 = mul <4 x i64> %a0, %a1
1800 %2 = trunc <4 x i64> %1 to <4 x i32>
1801 ret <4 x i32> %2
1802 }
1804 define <8 x i16> @trunc_mul_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
1805 ; SSE-LABEL: trunc_mul_v8i64_v8i16:
1807 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
1808 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7]
1809 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
1810 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
1811 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
1812 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,2,2,3]
1813 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,0,2,4,5,6,7]
1814 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
1815 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,0,2,4,5,6,7]
1816 ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
1817 ; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm4[0],xmm6[1]
1818 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1819 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
1820 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1821 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
1822 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
1823 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
1824 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
1825 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
1826 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
1827 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1828 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
1829 ; SSE-NEXT: pmullw %xmm6, %xmm0
1832 ; AVX1-LABEL: trunc_mul_v8i64_v8i16:
1834 ; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,65535,65535]
1835 ; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
1836 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
1837 ; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
1838 ; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
1839 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
1840 ; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2
1841 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
1842 ; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
1843 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
1844 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
1845 ; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
1846 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
1847 ; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
1848 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
1849 ; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0
1850 ; AVX1-NEXT: vzeroupper
1853 ; AVX2-SLOW-LABEL: trunc_mul_v8i64_v8i16:
1854 ; AVX2-SLOW: # %bb.0:
1855 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4
1856 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
1857 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4
1858 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
1859 ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
1860 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
1861 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
1862 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
1863 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm4
1864 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm4[0,2]
1865 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm4
1866 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2]
1867 ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1868 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
1869 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1870 ; AVX2-SLOW-NEXT: vpmullw %xmm2, %xmm0, %xmm0
1871 ; AVX2-SLOW-NEXT: vzeroupper
1872 ; AVX2-SLOW-NEXT: retq
1874 ; AVX2-FAST-LABEL: trunc_mul_v8i64_v8i16:
1875 ; AVX2-FAST: # %bb.0:
1876 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
1877 ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2
1878 ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3
1879 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
1880 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
1881 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2
1882 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
1883 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0
1884 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1
1885 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
1886 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0
1887 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1888 ; AVX2-FAST-NEXT: vpmullw %xmm2, %xmm0, %xmm0
1889 ; AVX2-FAST-NEXT: vzeroupper
1890 ; AVX2-FAST-NEXT: retq
1892 ; AVX512F-LABEL: trunc_mul_v8i64_v8i16:
1894 ; AVX512F-NEXT: vpmovqw %zmm1, %xmm1
1895 ; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
1896 ; AVX512F-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1897 ; AVX512F-NEXT: vzeroupper
1898 ; AVX512F-NEXT: retq
1900 ; AVX512BW-LABEL: trunc_mul_v8i64_v8i16:
1901 ; AVX512BW: # %bb.0:
1902 ; AVX512BW-NEXT: vpmovqw %zmm1, %xmm1
1903 ; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0
1904 ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1905 ; AVX512BW-NEXT: vzeroupper
1906 ; AVX512BW-NEXT: retq
1908 ; AVX512DQ-LABEL: trunc_mul_v8i64_v8i16:
1909 ; AVX512DQ: # %bb.0:
1910 ; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
1911 ; AVX512DQ-NEXT: vpmovqw %zmm0, %xmm0
1912 ; AVX512DQ-NEXT: vzeroupper
1913 ; AVX512DQ-NEXT: retq
1914 %1 = mul <8 x i64> %a0, %a1
1915 %2 = trunc <8 x i64> %1 to <8 x i16>
1916 ret <8 x i16> %2
1917 }
1919 define <8 x i16> @trunc_mul_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
1920 ; SSE-LABEL: trunc_mul_v8i32_v8i16:
1922 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
1923 ; SSE-NEXT: pmuludq %xmm2, %xmm0
1924 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1925 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
1926 ; SSE-NEXT: pmuludq %xmm4, %xmm2
1927 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
1928 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
1929 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
1930 ; SSE-NEXT: pmuludq %xmm3, %xmm1
1931 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1932 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
1933 ; SSE-NEXT: pmuludq %xmm2, %xmm3
1934 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
1935 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
1936 ; SSE-NEXT: pslld $16, %xmm1
1937 ; SSE-NEXT: psrad $16, %xmm1
1938 ; SSE-NEXT: pslld $16, %xmm0
1939 ; SSE-NEXT: psrad $16, %xmm0
1940 ; SSE-NEXT: packssdw %xmm1, %xmm0
1943 ; AVX1-LABEL: trunc_mul_v8i32_v8i16:
1945 ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm2
1946 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
1947 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1948 ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
1949 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
1950 ; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
1951 ; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1
1952 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1953 ; AVX1-NEXT: vzeroupper
1956 ; AVX2-LABEL: trunc_mul_v8i32_v8i16:
1958 ; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0
1959 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
1960 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1961 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1962 ; AVX2-NEXT: vzeroupper
1965 ; AVX512-LABEL: trunc_mul_v8i32_v8i16:
1967 ; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm0
1968 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
1969 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1970 ; AVX512-NEXT: vzeroupper
1972 %1 = mul <8 x i32> %a0, %a1
1973 %2 = trunc <8 x i32> %1 to <8 x i16>
1974 ret <8 x i16> %2
1975 }
1977 define <16 x i8> @trunc_mul_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
1978 ; SSE-LABEL: trunc_mul_v16i64_v16i8:
1980 ; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm0
1981 ; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm1
1982 ; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm2
1983 ; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm3
1984 ; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm4
1985 ; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm5
1986 ; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm6
1987 ; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm7
1988 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
1989 ; SSE-NEXT: pand %xmm8, %xmm7
1990 ; SSE-NEXT: pand %xmm8, %xmm6
1991 ; SSE-NEXT: packuswb %xmm7, %xmm6
1992 ; SSE-NEXT: pand %xmm8, %xmm5
1993 ; SSE-NEXT: pand %xmm8, %xmm4
1994 ; SSE-NEXT: packuswb %xmm5, %xmm4
1995 ; SSE-NEXT: packuswb %xmm6, %xmm4
1996 ; SSE-NEXT: pand %xmm8, %xmm3
1997 ; SSE-NEXT: pand %xmm8, %xmm2
1998 ; SSE-NEXT: packuswb %xmm3, %xmm2
1999 ; SSE-NEXT: pand %xmm8, %xmm1
2000 ; SSE-NEXT: pand %xmm8, %xmm0
2001 ; SSE-NEXT: packuswb %xmm1, %xmm0
2002 ; SSE-NEXT: packuswb %xmm2, %xmm0
2003 ; SSE-NEXT: packuswb %xmm4, %xmm0
2006 ; AVX1-LABEL: trunc_mul_v16i64_v16i8:
2008 ; AVX1-NEXT: vpmuludq %xmm4, %xmm0, %xmm8
2009 ; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm4
2010 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2011 ; AVX1-NEXT: vpmuludq %xmm4, %xmm0, %xmm0
2012 ; AVX1-NEXT: vpmuludq %xmm5, %xmm1, %xmm4
2013 ; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm5
2014 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
2015 ; AVX1-NEXT: vpmuludq %xmm5, %xmm1, %xmm1
2016 ; AVX1-NEXT: vpmuludq %xmm6, %xmm2, %xmm5
2017 ; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm6
2018 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
2019 ; AVX1-NEXT: vpmuludq %xmm6, %xmm2, %xmm2
2020 ; AVX1-NEXT: vpmuludq %xmm7, %xmm3, %xmm6
2021 ; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm7
2022 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
2023 ; AVX1-NEXT: vpmuludq %xmm7, %xmm3, %xmm3
2024 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [255,255]
2025 ; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
2026 ; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6
2027 ; AVX1-NEXT: vpackusdw %xmm3, %xmm6, %xmm3
2028 ; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2
2029 ; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5
2030 ; AVX1-NEXT: vpackusdw %xmm2, %xmm5, %xmm2
2031 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
2032 ; AVX1-NEXT: vpand %xmm7, %xmm1, %xmm1
2033 ; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm3
2034 ; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1
2035 ; AVX1-NEXT: vpand %xmm7, %xmm0, %xmm0
2036 ; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm3
2037 ; AVX1-NEXT: vpackusdw %xmm0, %xmm3, %xmm0
2038 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
2039 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
2040 ; AVX1-NEXT: vzeroupper
2043 ; AVX2-SLOW-LABEL: trunc_mul_v16i64_v16i8:
2044 ; AVX2-SLOW: # %bb.0:
2045 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm7, %xmm8
2046 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm8 = xmm7[0,2],xmm8[0,2]
2047 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm7
2048 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm7[0,2]
2049 ; AVX2-SLOW-NEXT: vpmulld %xmm8, %xmm3, %xmm3
2050 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm6, %xmm7
2051 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,2],xmm7[0,2]
2052 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm7
2053 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm7[0,2]
2054 ; AVX2-SLOW-NEXT: vpmulld %xmm6, %xmm2, %xmm2
2055 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
2056 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
2057 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
2058 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
2059 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255]
2060 ; AVX2-SLOW-NEXT: vpand %xmm6, %xmm2, %xmm2
2061 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm5, %xmm7
2062 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,2],xmm7[0,2]
2063 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm7
2064 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm7[0,2]
2065 ; AVX2-SLOW-NEXT: vpmulld %xmm5, %xmm1, %xmm1
2066 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm4, %xmm5
2067 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,2],xmm5[0,2]
2068 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5
2069 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
2070 ; AVX2-SLOW-NEXT: vpmulld %xmm4, %xmm0, %xmm0
2071 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
2072 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
2073 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
2074 ; AVX2-SLOW-NEXT: vpand %xmm6, %xmm0, %xmm0
2075 ; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
2076 ; AVX2-SLOW-NEXT: vzeroupper
2077 ; AVX2-SLOW-NEXT: retq
2079 ; AVX2-FAST-LABEL: trunc_mul_v16i64_v16i8:
2080 ; AVX2-FAST: # %bb.0:
2081 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,2,4,6,4,6,6,7]
2082 ; AVX2-FAST-NEXT: vpermd %ymm7, %ymm8, %ymm7
2083 ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm8, %ymm3
2084 ; AVX2-FAST-NEXT: vpmulld %xmm7, %xmm3, %xmm3
2085 ; AVX2-FAST-NEXT: vpermd %ymm6, %ymm8, %ymm6
2086 ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm8, %ymm2
2087 ; AVX2-FAST-NEXT: vpmulld %xmm6, %xmm2, %xmm2
2088 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
2089 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
2090 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2
2091 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
2092 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255]
2093 ; AVX2-FAST-NEXT: vpand %xmm6, %xmm2, %xmm2
2094 ; AVX2-FAST-NEXT: vpermd %ymm5, %ymm8, %ymm5
2095 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm8, %ymm1
2096 ; AVX2-FAST-NEXT: vpmulld %xmm5, %xmm1, %xmm1
2097 ; AVX2-FAST-NEXT: vpermd %ymm4, %ymm8, %ymm4
2098 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm8, %ymm0
2099 ; AVX2-FAST-NEXT: vpmulld %xmm4, %xmm0, %xmm0
2100 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
2101 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0
2102 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
2103 ; AVX2-FAST-NEXT: vpand %xmm6, %xmm0, %xmm0
2104 ; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
2105 ; AVX2-FAST-NEXT: vzeroupper
2106 ; AVX2-FAST-NEXT: retq
2108 ; AVX512F-LABEL: trunc_mul_v16i64_v16i8:
2110 ; AVX512F-NEXT: vpmovqd %zmm3, %ymm3
2111 ; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
2112 ; AVX512F-NEXT: vpmulld %ymm3, %ymm1, %ymm1
2113 ; AVX512F-NEXT: vpmovqd %zmm2, %ymm2
2114 ; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
2115 ; AVX512F-NEXT: vpmulld %ymm2, %ymm0, %ymm0
2116 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
2117 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
2118 ; AVX512F-NEXT: vzeroupper
2119 ; AVX512F-NEXT: retq
2121 ; AVX512BW-LABEL: trunc_mul_v16i64_v16i8:
2122 ; AVX512BW: # %bb.0:
2123 ; AVX512BW-NEXT: vpmovqd %zmm3, %ymm3
2124 ; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
2125 ; AVX512BW-NEXT: vpmulld %ymm3, %ymm1, %ymm1
2126 ; AVX512BW-NEXT: vpmovqd %zmm2, %ymm2
2127 ; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
2128 ; AVX512BW-NEXT: vpmulld %ymm2, %ymm0, %ymm0
2129 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
2130 ; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
2131 ; AVX512BW-NEXT: vzeroupper
2132 ; AVX512BW-NEXT: retq
2134 ; AVX512DQ-LABEL: trunc_mul_v16i64_v16i8:
2135 ; AVX512DQ: # %bb.0:
2136 ; AVX512DQ-NEXT: vpmullq %zmm3, %zmm1, %zmm1
2137 ; AVX512DQ-NEXT: vpmullq %zmm2, %zmm0, %zmm0
2138 ; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0
2139 ; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1
2140 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
2141 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
2142 ; AVX512DQ-NEXT: vzeroupper
2143 ; AVX512DQ-NEXT: retq
2144 %1 = mul <16 x i64> %a0, %a1
2145 %2 = trunc <16 x i64> %1 to <16 x i8>
2146 ret <16 x i8> %2
2147 }
2149 define <16 x i8> @trunc_mul_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
2150 ; SSE-LABEL: trunc_mul_v16i32_v16i8:
2152 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3]
2153 ; SSE-NEXT: pmuludq %xmm4, %xmm0
2154 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2155 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
2156 ; SSE-NEXT: pmuludq %xmm8, %xmm4
2157 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
2158 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
2159 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
2160 ; SSE-NEXT: pmuludq %xmm5, %xmm1
2161 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
2162 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
2163 ; SSE-NEXT: pmuludq %xmm4, %xmm5
2164 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3]
2165 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
2166 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
2167 ; SSE-NEXT: pmuludq %xmm6, %xmm2
2168 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
2169 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
2170 ; SSE-NEXT: pmuludq %xmm4, %xmm5
2171 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3]
2172 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
2173 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
2174 ; SSE-NEXT: pmuludq %xmm7, %xmm3
2175 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
2176 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
2177 ; SSE-NEXT: pmuludq %xmm4, %xmm5
2178 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3]
2179 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
2180 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
2181 ; SSE-NEXT: pand %xmm4, %xmm3
2182 ; SSE-NEXT: pand %xmm4, %xmm2
2183 ; SSE-NEXT: packuswb %xmm3, %xmm2
2184 ; SSE-NEXT: pand %xmm4, %xmm1
2185 ; SSE-NEXT: pand %xmm4, %xmm0
2186 ; SSE-NEXT: packuswb %xmm1, %xmm0
2187 ; SSE-NEXT: packuswb %xmm2, %xmm0
2190 ; AVX1-LABEL: trunc_mul_v16i32_v16i8:
2192 ; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm4
2193 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
2194 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2195 ; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm0
2196 ; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm2
2197 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
2198 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
2199 ; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm1
2200 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255]
2201 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
2202 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
2203 ; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
2204 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
2205 ; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm2
2206 ; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
2207 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
2208 ; AVX1-NEXT: vzeroupper
2211 ; AVX2-LABEL: trunc_mul_v16i32_v16i8:
2213 ; AVX2-NEXT: vpmulld %ymm2, %ymm0, %ymm0
2214 ; AVX2-NEXT: vpmulld %ymm3, %ymm1, %ymm1
2215 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
2216 ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
2217 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
2218 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
2219 ; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1
2220 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
2221 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
2222 ; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0
2223 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
2224 ; AVX2-NEXT: vzeroupper
2227 ; AVX512-LABEL: trunc_mul_v16i32_v16i8:
2229 ; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0
2230 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
2231 ; AVX512-NEXT: vzeroupper
2233 %1 = mul <16 x i32> %a0, %a1
2234 %2 = trunc <16 x i32> %1 to <16 x i8>
2235 ret <16 x i8> %2
2236 }
2238 define <16 x i8> @trunc_mul_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
2239 ; SSE-LABEL: trunc_mul_v16i16_v16i8:
2241 ; SSE-NEXT: pmullw %xmm2, %xmm0
2242 ; SSE-NEXT: pmullw %xmm3, %xmm1
2243 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
2244 ; SSE-NEXT: pand %xmm2, %xmm1
2245 ; SSE-NEXT: pand %xmm2, %xmm0
2246 ; SSE-NEXT: packuswb %xmm1, %xmm0
2249 ; AVX1-LABEL: trunc_mul_v16i16_v16i8:
2251 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm2
2252 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
2253 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2254 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
2255 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
2256 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
2257 ; AVX1-NEXT: vpand %xmm1, %xmm2, %xmm1
2258 ; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0
2259 ; AVX1-NEXT: vzeroupper
2262 ; AVX2-LABEL: trunc_mul_v16i16_v16i8:
2264 ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
2265 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
2266 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
2267 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
2268 ; AVX2-NEXT: vzeroupper
2271 ; AVX512F-LABEL: trunc_mul_v16i16_v16i8:
2273 ; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm0
2274 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
2275 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
2276 ; AVX512F-NEXT: vzeroupper
2277 ; AVX512F-NEXT: retq
2279 ; AVX512BW-LABEL: trunc_mul_v16i16_v16i8:
2280 ; AVX512BW: # %bb.0:
2281 ; AVX512BW-NEXT: vpmullw %ymm1, %ymm0, %ymm0
2282 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
2283 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2284 ; AVX512BW-NEXT: vzeroupper
2285 ; AVX512BW-NEXT: retq
2287 ; AVX512DQ-LABEL: trunc_mul_v16i16_v16i8:
2288 ; AVX512DQ: # %bb.0:
2289 ; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0
2290 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
2291 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
2292 ; AVX512DQ-NEXT: vzeroupper
2293 ; AVX512DQ-NEXT: retq
2294 %1 = mul <16 x i16> %a0, %a1
2295 %2 = trunc <16 x i16> %1 to <16 x i8>
2296 ret <16 x i8> %2
2297 }
2299 define <8 x i16> @trunc_mul_v8i32_v8i16_zext_8i8(<16 x i8> %a0, <8 x i32> %a1) {
2300 ; SSE-LABEL: trunc_mul_v8i32_v8i16_zext_8i8:
2302 ; SSE-NEXT: pxor %xmm3, %xmm3
2303 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
2304 ; SSE-NEXT: pslld $16, %xmm2
2305 ; SSE-NEXT: psrad $16, %xmm2
2306 ; SSE-NEXT: pslld $16, %xmm1
2307 ; SSE-NEXT: psrad $16, %xmm1
2308 ; SSE-NEXT: packssdw %xmm2, %xmm1
2309 ; SSE-NEXT: pmullw %xmm1, %xmm0
2312 ; AVX1-LABEL: trunc_mul_v8i32_v8i16_zext_8i8:
2314 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
2315 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
2316 ; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
2317 ; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
2318 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
2319 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2320 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
2321 ; AVX1-NEXT: vzeroupper
2324 ; AVX2-LABEL: trunc_mul_v8i32_v8i16_zext_8i8:
2326 ; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2327 ; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
2328 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
2329 ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
2330 ; AVX2-NEXT: vzeroupper
2333 ; AVX512-LABEL: trunc_mul_v8i32_v8i16_zext_8i8:
2335 ; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
2336 ; AVX512-NEXT: vpmovdw %zmm1, %ymm1
2337 ; AVX512-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2338 ; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0
2339 ; AVX512-NEXT: vzeroupper
2341 %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2342 %2 = zext <8 x i8> %1 to <8 x i32>
2343 %3 = mul <8 x i32> %2, %a1
2344 %4 = trunc <8 x i32> %3 to <8 x i16>
2345 ret <8 x i16> %4
2346 }
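; NOTE: the remaining multiply tests use constant operands; where possible the constant is
; expected to be folded as a RIP-relative memory operand of the multiply ({{.*}}(%rip))
; rather than being materialized in a register first.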
2352 define <4 x i32> @trunc_mul_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
2353 ; SSE-LABEL: trunc_mul_const_v4i64_v4i32:
2355 ; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm1
2356 ; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm0
2357 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2360 ; AVX1-LABEL: trunc_mul_const_v4i64_v4i32:
2362 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
2363 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2364 ; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
2365 ; AVX1-NEXT: vzeroupper
2368 ; AVX2-SLOW-LABEL: trunc_mul_const_v4i64_v4i32:
2369 ; AVX2-SLOW: # %bb.0:
2370 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
2371 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2372 ; AVX2-SLOW-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
2373 ; AVX2-SLOW-NEXT: vzeroupper
2374 ; AVX2-SLOW-NEXT: retq
2376 ; AVX2-FAST-LABEL: trunc_mul_const_v4i64_v4i32:
2377 ; AVX2-FAST: # %bb.0:
2378 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
2379 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
2380 ; AVX2-FAST-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
2381 ; AVX2-FAST-NEXT: vzeroupper
2382 ; AVX2-FAST-NEXT: retq
2384 ; AVX512-LABEL: trunc_mul_const_v4i64_v4i32:
2386 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
2387 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
2388 ; AVX512-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
2389 ; AVX512-NEXT: vzeroupper
2391 %1 = mul <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
2392 %2 = trunc <4 x i64> %1 to <4 x i32>
2393 ret <4 x i32> %2
2394 }
2396 define <8 x i16> @trunc_mul_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
2397 ; SSE-LABEL: trunc_mul_const_v8i64_v8i16:
2399 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
2400 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
2401 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2402 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
2403 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
2404 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
2405 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
2406 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
2407 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
2408 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2409 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
2410 ; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
2413 ; AVX1-LABEL: trunc_mul_const_v8i64_v8i16:
2415 ; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
2416 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
2417 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
2418 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
2419 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
2420 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
2421 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
2422 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
2423 ; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
2424 ; AVX1-NEXT: vzeroupper
2427 ; AVX2-SLOW-LABEL: trunc_mul_const_v8i64_v8i16:
2428 ; AVX2-SLOW: # %bb.0:
2429 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
2430 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
2431 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
2432 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
2433 ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
2434 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
2435 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
2436 ; AVX2-SLOW-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
2437 ; AVX2-SLOW-NEXT: vzeroupper
2438 ; AVX2-SLOW-NEXT: retq
2440 ; AVX2-FAST-LABEL: trunc_mul_const_v8i64_v8i16:
2441 ; AVX2-FAST: # %bb.0:
2442 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
2443 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
2444 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
2445 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
2446 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
2447 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
2448 ; AVX2-FAST-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
2449 ; AVX2-FAST-NEXT: vzeroupper
2450 ; AVX2-FAST-NEXT: retq
2452 ; AVX512-LABEL: trunc_mul_const_v8i64_v8i16:
2454 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0
2455 ; AVX512-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
2456 ; AVX512-NEXT: vzeroupper
2458 %1 = mul <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
2459 %2 = trunc <8 x i64> %1 to <8 x i16>
2460 ret <8 x i16> %2
2461 }
2463 define <8 x i16> @trunc_mul_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
2464 ; SSE-LABEL: trunc_mul_const_v8i32_v8i16:
2466 ; SSE-NEXT: pslld $16, %xmm1
2467 ; SSE-NEXT: psrad $16, %xmm1
2468 ; SSE-NEXT: pslld $16, %xmm0
2469 ; SSE-NEXT: psrad $16, %xmm0
2470 ; SSE-NEXT: packssdw %xmm1, %xmm0
2471 ; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
2474 ; AVX1-LABEL: trunc_mul_const_v8i32_v8i16:
2476 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
2477 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
2478 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
2479 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
2480 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2481 ; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
2482 ; AVX1-NEXT: vzeroupper
2485 ; AVX2-LABEL: trunc_mul_const_v8i32_v8i16:
2487 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
2488 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
2489 ; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
2490 ; AVX2-NEXT: vzeroupper
2493 ; AVX512-LABEL: trunc_mul_const_v8i32_v8i16:
2495 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
2496 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
2497 ; AVX512-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
2498 ; AVX512-NEXT: vzeroupper
2500 %1 = mul <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2501 %2 = trunc <8 x i32> %1 to <8 x i16>
2502 ret <8 x i16> %2
2503 }
2505 define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
2506 ; SSE-LABEL: trunc_mul_const_v16i64_v16i8:
2508 ; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm0
2509 ; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm1
2510 ; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm2
2511 ; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm3
2512 ; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm4
2513 ; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm5
2514 ; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm6
2515 ; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm7
2516 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
2517 ; SSE-NEXT: pand %xmm8, %xmm7
2518 ; SSE-NEXT: pand %xmm8, %xmm6
2519 ; SSE-NEXT: packuswb %xmm7, %xmm6
2520 ; SSE-NEXT: pand %xmm8, %xmm5
2521 ; SSE-NEXT: pand %xmm8, %xmm4
2522 ; SSE-NEXT: packuswb %xmm5, %xmm4
2523 ; SSE-NEXT: packuswb %xmm6, %xmm4
2524 ; SSE-NEXT: pand %xmm8, %xmm3
2525 ; SSE-NEXT: pand %xmm8, %xmm2
2526 ; SSE-NEXT: packuswb %xmm3, %xmm2
2527 ; SSE-NEXT: pand %xmm8, %xmm1
2528 ; SSE-NEXT: pand %xmm8, %xmm0
2529 ; SSE-NEXT: packuswb %xmm1, %xmm0
2530 ; SSE-NEXT: packuswb %xmm2, %xmm0
2531 ; SSE-NEXT: packuswb %xmm4, %xmm0
2534 ; AVX1-LABEL: trunc_mul_const_v16i64_v16i8:
2536 ; AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm8
2537 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2538 ; AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0
2539 ; AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm5
2540 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
2541 ; AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1
2542 ; AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm2, %xmm6
2543 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
2544 ; AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm2, %xmm2
2545 ; AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm3, %xmm7
2546 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
2547 ; AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm3, %xmm3
2548 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255]
2549 ; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
2550 ; AVX1-NEXT: vpand %xmm4, %xmm7, %xmm7
2551 ; AVX1-NEXT: vpackusdw %xmm3, %xmm7, %xmm3
2552 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
2553 ; AVX1-NEXT: vpand %xmm4, %xmm6, %xmm6
2554 ; AVX1-NEXT: vpackusdw %xmm2, %xmm6, %xmm2
2555 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
2556 ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
2557 ; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm3
2558 ; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1
2559 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
2560 ; AVX1-NEXT: vpand %xmm4, %xmm8, %xmm3
2561 ; AVX1-NEXT: vpackusdw %xmm0, %xmm3, %xmm0
2562 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
2563 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
2564 ; AVX1-NEXT: vzeroupper
2567 ; AVX2-SLOW-LABEL: trunc_mul_const_v16i64_v16i8:
2568 ; AVX2-SLOW: # %bb.0:
2569 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4
2570 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
2571 ; AVX2-SLOW-NEXT: vpmulld {{.*}}(%rip), %xmm2, %xmm2
2572 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4
2573 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
2574 ; AVX2-SLOW-NEXT: vpmulld {{.*}}(%rip), %xmm3, %xmm3
2575 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
2576 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
2577 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
2578 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
2579 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
2580 ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
2581 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5
2582 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
2583 ; AVX2-SLOW-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
2584 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5
2585 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
2586 ; AVX2-SLOW-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
2587 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
2588 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
2589 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
2590 ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
2591 ; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
2592 ; AVX2-SLOW-NEXT: vzeroupper
2593 ; AVX2-SLOW-NEXT: retq
2595 ; AVX2-FAST-LABEL: trunc_mul_const_v16i64_v16i8:
2596 ; AVX2-FAST: # %bb.0:
2597 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
2598 ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2
2599 ; AVX2-FAST-NEXT: vpmulld {{.*}}(%rip), %xmm2, %xmm2
2600 ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3
2601 ; AVX2-FAST-NEXT: vpmulld {{.*}}(%rip), %xmm3, %xmm3
2602 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
2603 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
2604 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2
2605 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
2606 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
2607 ; AVX2-FAST-NEXT: vpand %xmm5, %xmm2, %xmm2
2608 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0
2609 ; AVX2-FAST-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
2610 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1
2611 ; AVX2-FAST-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
2612 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
2613 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0
2614 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
2615 ; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0
2616 ; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
2617 ; AVX2-FAST-NEXT: vzeroupper
2618 ; AVX2-FAST-NEXT: retq
2620 ; AVX512-LABEL: trunc_mul_const_v16i64_v16i8:
2622 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
2623 ; AVX512-NEXT: vpmulld {{.*}}(%rip), %ymm0, %ymm0
2624 ; AVX512-NEXT: vpmovqd %zmm1, %ymm1
2625 ; AVX512-NEXT: vpmulld {{.*}}(%rip), %ymm1, %ymm1
2626 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
2627 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
2628 ; AVX512-NEXT: vzeroupper
2630 %1 = mul <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
2631 %2 = trunc <16 x i64> %1 to <16 x i8>
2632 ret <16 x i8> %2
2633 }
2635 define <16 x i8> @trunc_mul_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
2636 ; SSE-LABEL: trunc_mul_const_v16i32_v16i8:
2638 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [0,1,2,3]
2639 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
2640 ; SSE-NEXT: pmuludq %xmm4, %xmm0
2641 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2642 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
2643 ; SSE-NEXT: pmuludq %xmm5, %xmm4
2644 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
2645 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
2646 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [4,5,6,7]
2647 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
2648 ; SSE-NEXT: pmuludq %xmm4, %xmm1
2649 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
2650 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
2651 ; SSE-NEXT: pmuludq %xmm5, %xmm4
2652 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
2653 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
2654 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [8,9,10,11]
2655 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
2656 ; SSE-NEXT: pmuludq %xmm4, %xmm2
2657 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
2658 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
2659 ; SSE-NEXT: pmuludq %xmm5, %xmm4
2660 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
2661 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
2662 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [12,13,14,15]
2663 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
2664 ; SSE-NEXT: pmuludq %xmm4, %xmm3
2665 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
2666 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
2667 ; SSE-NEXT: pmuludq %xmm5, %xmm4
2668 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
2669 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
2670 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
2671 ; SSE-NEXT: pand %xmm4, %xmm3
2672 ; SSE-NEXT: pand %xmm4, %xmm2
2673 ; SSE-NEXT: packuswb %xmm3, %xmm2
2674 ; SSE-NEXT: pand %xmm4, %xmm1
2675 ; SSE-NEXT: pand %xmm4, %xmm0
2676 ; SSE-NEXT: packuswb %xmm1, %xmm0
2677 ; SSE-NEXT: packuswb %xmm2, %xmm0
2680 ; AVX1-LABEL: trunc_mul_const_v16i32_v16i8:
2682 ; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
2683 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2684 ; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
2685 ; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm3
2686 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
2687 ; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
2688 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255]
2689 ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
2690 ; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
2691 ; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1
2692 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
2693 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
2694 ; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
2695 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
2696 ; AVX1-NEXT: vzeroupper
2699 ; AVX2-LABEL: trunc_mul_const_v16i32_v16i8:
2701 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
2702 ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
2703 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
2704 ; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1
2705 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
2706 ; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1
2707 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
2708 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
2709 ; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
2710 ; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0
2711 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
2712 ; AVX2-NEXT: vzeroupper
2715 ; AVX512-LABEL: trunc_mul_const_v16i32_v16i8:
2717 ; AVX512-NEXT: vpmulld {{.*}}(%rip), %zmm0, %zmm0
2718 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
2719 ; AVX512-NEXT: vzeroupper
2721 %1 = mul <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
2722 %2 = trunc <16 x i32> %1 to <16 x i8>
2723 ret <16 x i8> %2
2724 }
2726 define <16 x i8> @trunc_mul_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
2727 ; SSE-LABEL: trunc_mul_const_v16i16_v16i8:
2729 ; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
2730 ; SSE-NEXT: pmullw {{.*}}(%rip), %xmm1
2731 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
2732 ; SSE-NEXT: pand %xmm2, %xmm1
2733 ; SSE-NEXT: pand %xmm2, %xmm0
2734 ; SSE-NEXT: packuswb %xmm1, %xmm0
2737 ; AVX1-LABEL: trunc_mul_const_v16i16_v16i8:
2739 ; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm1
2740 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2741 ; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
2742 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
2743 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
2744 ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
2745 ; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0
2746 ; AVX1-NEXT: vzeroupper
2749 ; AVX2-LABEL: trunc_mul_const_v16i16_v16i8:
2751 ; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
2752 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
2753 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
2754 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
2755 ; AVX2-NEXT: vzeroupper
2758 ; AVX512F-LABEL: trunc_mul_const_v16i16_v16i8:
2760 ; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
2761 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
2762 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
2763 ; AVX512F-NEXT: vzeroupper
2764 ; AVX512F-NEXT: retq
2766 ; AVX512BW-LABEL: trunc_mul_const_v16i16_v16i8:
2767 ; AVX512BW: # %bb.0:
2768 ; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
2769 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
2770 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2771 ; AVX512BW-NEXT: vzeroupper
2772 ; AVX512BW-NEXT: retq
2774 ; AVX512DQ-LABEL: trunc_mul_const_v16i16_v16i8:
2775 ; AVX512DQ: # %bb.0:
2776 ; AVX512DQ-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
2777 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
2778 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
2779 ; AVX512DQ-NEXT: vzeroupper
2780 ; AVX512DQ-NEXT: retq
2781 %1 = mul <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
2782 %2 = trunc <16 x i16> %1 to <16 x i8>
2783 ret <16 x i8> %2
2784 }
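; NOTE: the bitwise tests that follow rely on the logic op commuting with truncation, so the
; and is expected to stay at the original vector width (e.g. andps/vpand on the wide type)
; followed by the usual truncation pattern.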
2790 define <4 x i32> @trunc_and_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2791 ; SSE-LABEL: trunc_and_v4i64_v4i32:
2793 ; SSE-NEXT: andps %xmm3, %xmm1
2794 ; SSE-NEXT: andps %xmm2, %xmm0
2795 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2798 ; AVX1-LABEL: trunc_and_v4i64_v4i32:
2800 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
2801 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
2802 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2803 ; AVX1-NEXT: vzeroupper
2806 ; AVX2-SLOW-LABEL: trunc_and_v4i64_v4i32:
2807 ; AVX2-SLOW: # %bb.0:
2808 ; AVX2-SLOW-NEXT: vandps %ymm1, %ymm0, %ymm0
2809 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
2810 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2811 ; AVX2-SLOW-NEXT: vzeroupper
2812 ; AVX2-SLOW-NEXT: retq
2814 ; AVX2-FAST-LABEL: trunc_and_v4i64_v4i32:
2815 ; AVX2-FAST: # %bb.0:
2816 ; AVX2-FAST-NEXT: vandps %ymm1, %ymm0, %ymm0
2817 ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
2818 ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0
2819 ; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2820 ; AVX2-FAST-NEXT: vzeroupper
2821 ; AVX2-FAST-NEXT: retq
2823 ; AVX512-LABEL: trunc_and_v4i64_v4i32:
2825 ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
2826 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
2827 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2828 ; AVX512-NEXT: vzeroupper
2830 %1 = and <4 x i64> %a0, %a1
2831 %2 = trunc <4 x i64> %1 to <4 x i32>
2835 define <8 x i16> @trunc_and_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
2836 ; SSE-LABEL: trunc_and_v8i64_v8i16:
2838 ; SSE-NEXT: pand %xmm6, %xmm2
2839 ; SSE-NEXT: pand %xmm7, %xmm3
2840 ; SSE-NEXT: pand %xmm4, %xmm0
2841 ; SSE-NEXT: pand %xmm5, %xmm1
2842 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
2843 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
2844 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2845 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
2846 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
2847 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
2848 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
2849 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
2850 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
2851 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2852 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
2855 ; AVX1-LABEL: trunc_and_v8i64_v8i16:
2857 ; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,65535,65535]
2858 ; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
2859 ; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
2860 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
2861 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
2862 ; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
2863 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
2864 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
2865 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
2866 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
2867 ; AVX1-NEXT: vzeroupper
2870 ; AVX2-SLOW-LABEL: trunc_and_v8i64_v8i16:
2871 ; AVX2-SLOW: # %bb.0:
2872 ; AVX2-SLOW-NEXT: vandps %ymm2, %ymm0, %ymm0
2873 ; AVX2-SLOW-NEXT: vandps %ymm3, %ymm1, %ymm1
2874 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
2875 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
2876 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
2877 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
2878 ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
2879 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
2880 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
2881 ; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2882 ; AVX2-SLOW-NEXT: vzeroupper
2883 ; AVX2-SLOW-NEXT: retq
2885 ; AVX2-FAST-LABEL: trunc_and_v8i64_v8i16:
2886 ; AVX2-FAST: # %bb.0:
2887 ; AVX2-FAST-NEXT: vpand %ymm3, %ymm1, %ymm1
2888 ; AVX2-FAST-NEXT: vpand %ymm2, %ymm0, %ymm0
2889 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
2890 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
2891 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
2892 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
2893 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
2894 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
2895 ; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2896 ; AVX2-FAST-NEXT: vzeroupper
2897 ; AVX2-FAST-NEXT: retq
2899 ; AVX512-LABEL: trunc_and_v8i64_v8i16:
2901 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
2902 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0
2903 ; AVX512-NEXT: vzeroupper
2905 %1 = and <8 x i64> %a0, %a1
2906 %2 = trunc <8 x i64> %1 to <8 x i16>
2910 define <8 x i16> @trunc_and_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
2911 ; SSE-LABEL: trunc_and_v8i32_v8i16:
2913 ; SSE-NEXT: pand %xmm2, %xmm0
2914 ; SSE-NEXT: pand %xmm3, %xmm1
2915 ; SSE-NEXT: pslld $16, %xmm1
2916 ; SSE-NEXT: psrad $16, %xmm1
2917 ; SSE-NEXT: pslld $16, %xmm0
2918 ; SSE-NEXT: psrad $16, %xmm0
2919 ; SSE-NEXT: packssdw %xmm1, %xmm0
2922 ; AVX1-LABEL: trunc_and_v8i32_v8i16:
2924 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
2925 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
2926 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
2927 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
2928 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
2929 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2930 ; AVX1-NEXT: vzeroupper
2933 ; AVX2-LABEL: trunc_and_v8i32_v8i16:
2935 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
2936 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
2937 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
2938 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2939 ; AVX2-NEXT: vzeroupper
2942 ; AVX512-LABEL: trunc_and_v8i32_v8i16:
2944 ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
2945 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
2946 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2947 ; AVX512-NEXT: vzeroupper
2949 %1 = and <8 x i32> %a0, %a1
2950 %2 = trunc <8 x i32> %1 to <8 x i16>
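; Note: in the v16i64 cases each operand is 1024 bits wide, so under the x86-64 calling
; convention only xmm0-xmm7 carry %a0 and all of %a1 comes from memory, which is why the
; SSE checks below use {{[0-9]+}}(%rsp) operands.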
2954 define <16 x i8> @trunc_and_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
2955 ; SSE-LABEL: trunc_and_v16i64_v16i8:
2957 ; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm0
2958 ; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm1
2959 ; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm2
2960 ; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm3
2961 ; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm4
2962 ; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm5
2963 ; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm6
2964 ; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm7
2965 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
2966 ; SSE-NEXT: pand %xmm8, %xmm7
2967 ; SSE-NEXT: pand %xmm8, %xmm6
2968 ; SSE-NEXT: packuswb %xmm7, %xmm6
2969 ; SSE-NEXT: pand %xmm8, %xmm5
2970 ; SSE-NEXT: pand %xmm8, %xmm4
2971 ; SSE-NEXT: packuswb %xmm5, %xmm4
2972 ; SSE-NEXT: packuswb %xmm6, %xmm4
2973 ; SSE-NEXT: pand %xmm8, %xmm3
2974 ; SSE-NEXT: pand %xmm8, %xmm2
2975 ; SSE-NEXT: packuswb %xmm3, %xmm2
2976 ; SSE-NEXT: pand %xmm8, %xmm1
2977 ; SSE-NEXT: pand %xmm8, %xmm0
2978 ; SSE-NEXT: packuswb %xmm1, %xmm0
2979 ; SSE-NEXT: packuswb %xmm2, %xmm0
2980 ; SSE-NEXT: packuswb %xmm4, %xmm0
2983 ; AVX1-LABEL: trunc_and_v16i64_v16i8:
2985 ; AVX1-NEXT: vmovaps {{.*#+}} ymm8 = [255,255,255,255]
2986 ; AVX1-NEXT: vandps %ymm8, %ymm7, %ymm7
2987 ; AVX1-NEXT: vandps %ymm7, %ymm3, %ymm3
2988 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm7
2989 ; AVX1-NEXT: vpackusdw %xmm7, %xmm3, %xmm3
2990 ; AVX1-NEXT: vandps %ymm8, %ymm6, %ymm6
2991 ; AVX1-NEXT: vandps %ymm6, %ymm2, %ymm2
2992 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6
2993 ; AVX1-NEXT: vpackusdw %xmm6, %xmm2, %xmm2
2994 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
2995 ; AVX1-NEXT: vandps %ymm8, %ymm5, %ymm3
2996 ; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
2997 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
2998 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
2999 ; AVX1-NEXT: vandps %ymm8, %ymm4, %ymm3
3000 ; AVX1-NEXT: vandps %ymm3, %ymm0, %ymm0
3001 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
3002 ; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
3003 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
3004 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
3005 ; AVX1-NEXT: vzeroupper
3008 ; AVX2-SLOW-LABEL: trunc_and_v16i64_v16i8:
3009 ; AVX2-SLOW: # %bb.0:
3010 ; AVX2-SLOW-NEXT: vandps %ymm4, %ymm0, %ymm0
3011 ; AVX2-SLOW-NEXT: vandps %ymm5, %ymm1, %ymm1
3012 ; AVX2-SLOW-NEXT: vandps %ymm6, %ymm2, %ymm2
3013 ; AVX2-SLOW-NEXT: vandps %ymm7, %ymm3, %ymm3
3014 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4
3015 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
3016 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4
3017 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
3018 ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
3019 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
3020 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
3021 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
3022 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
3023 ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
3024 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5
3025 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
3026 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5
3027 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
3028 ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
3029 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
3030 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3031 ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
3032 ; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
3033 ; AVX2-SLOW-NEXT: vzeroupper
3034 ; AVX2-SLOW-NEXT: retq
3036 ; AVX2-FAST-LABEL: trunc_and_v16i64_v16i8:
3037 ; AVX2-FAST: # %bb.0:
3038 ; AVX2-FAST-NEXT: vpand %ymm5, %ymm1, %ymm1
3039 ; AVX2-FAST-NEXT: vpand %ymm4, %ymm0, %ymm0
3040 ; AVX2-FAST-NEXT: vpand %ymm7, %ymm3, %ymm3
3041 ; AVX2-FAST-NEXT: vpand %ymm6, %ymm2, %ymm2
3042 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
3043 ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2
3044 ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3
3045 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
3046 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
3047 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2
3048 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
3049 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
3050 ; AVX2-FAST-NEXT: vpand %xmm5, %xmm2, %xmm2
3051 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0
3052 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1
3053 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
3054 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0
3055 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3056 ; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0
3057 ; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
3058 ; AVX2-FAST-NEXT: vzeroupper
3059 ; AVX2-FAST-NEXT: retq
3061 ; AVX512-LABEL: trunc_and_v16i64_v16i8:
3063 ; AVX512-NEXT: vpandq %zmm3, %zmm1, %zmm1
3064 ; AVX512-NEXT: vpandq %zmm2, %zmm0, %zmm0
3065 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
3066 ; AVX512-NEXT: vpmovqd %zmm1, %ymm1
3067 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
3068 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
3069 ; AVX512-NEXT: vzeroupper
3071 %1 = and <16 x i64> %a0, %a1
3072 %2 = trunc <16 x i64> %1 to <16 x i8>
3076 define <16 x i8> @trunc_and_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
3077 ; SSE-LABEL: trunc_and_v16i32_v16i8:
3079 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
3080 ; SSE-NEXT: pand %xmm8, %xmm7
3081 ; SSE-NEXT: pand %xmm3, %xmm7
3082 ; SSE-NEXT: pand %xmm8, %xmm6
3083 ; SSE-NEXT: pand %xmm2, %xmm6
3084 ; SSE-NEXT: packuswb %xmm7, %xmm6
3085 ; SSE-NEXT: pand %xmm8, %xmm5
3086 ; SSE-NEXT: pand %xmm1, %xmm5
3087 ; SSE-NEXT: pand %xmm8, %xmm4
3088 ; SSE-NEXT: pand %xmm4, %xmm0
3089 ; SSE-NEXT: packuswb %xmm5, %xmm0
3090 ; SSE-NEXT: packuswb %xmm6, %xmm0
3093 ; AVX1-LABEL: trunc_and_v16i32_v16i8:
3095 ; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255]
3096 ; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
3097 ; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
3098 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
3099 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
3100 ; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
3101 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
3102 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
3103 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
3104 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
3105 ; AVX1-NEXT: vzeroupper
3108 ; AVX2-LABEL: trunc_and_v16i32_v16i8:
3110 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
3111 ; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
3112 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
3113 ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
3114 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
3115 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
3116 ; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1
3117 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
3118 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3119 ; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0
3120 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
3121 ; AVX2-NEXT: vzeroupper
3124 ; AVX512-LABEL: trunc_and_v16i32_v16i8:
3126 ; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0
3127 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
3128 ; AVX512-NEXT: vzeroupper
3130 %1 = and <16 x i32> %a0, %a1
3131 %2 = trunc <16 x i32> %1 to <16 x i8>
3135 define <16 x i8> @trunc_and_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
3136 ; SSE-LABEL: trunc_and_v16i16_v16i8:
3138 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
3139 ; SSE-NEXT: pand %xmm4, %xmm3
3140 ; SSE-NEXT: pand %xmm1, %xmm3
3141 ; SSE-NEXT: pand %xmm4, %xmm2
3142 ; SSE-NEXT: pand %xmm2, %xmm0
3143 ; SSE-NEXT: packuswb %xmm3, %xmm0
3146 ; AVX1-LABEL: trunc_and_v16i16_v16i8:
3148 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
3149 ; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
3150 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
3151 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
3152 ; AVX1-NEXT: vzeroupper
3155 ; AVX2-LABEL: trunc_and_v16i16_v16i8:
3157 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
3158 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
3159 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
3160 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
3161 ; AVX2-NEXT: vzeroupper
3164 ; AVX512F-LABEL: trunc_and_v16i16_v16i8:
3166 ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
3167 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
3168 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
3169 ; AVX512F-NEXT: vzeroupper
3170 ; AVX512F-NEXT: retq
3172 ; AVX512BW-LABEL: trunc_and_v16i16_v16i8:
3173 ; AVX512BW: # %bb.0:
3174 ; AVX512BW-NEXT: vpand %ymm1, %ymm0, %ymm0
3175 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
3176 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
3177 ; AVX512BW-NEXT: vzeroupper
3178 ; AVX512BW-NEXT: retq
3180 ; AVX512DQ-LABEL: trunc_and_v16i16_v16i8:
3181 ; AVX512DQ: # %bb.0:
3182 ; AVX512DQ-NEXT: vpand %ymm1, %ymm0, %ymm0
3183 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
3184 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
3185 ; AVX512DQ-NEXT: vzeroupper
3186 ; AVX512DQ-NEXT: retq
3187 %1 = and <16 x i16> %a0, %a1
3188 %2 = trunc <16 x i16> %1 to <16 x i8>
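;
; and constant: 'and' with a constant vector followed by truncation. The checks show the
; constant being folded across the truncate: the narrowing shuffles/packs (or vpmov* on
; AVX512) run first and a single pand/vpand against a constant-pool value is applied to
; the narrowed result.
;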
3196 define <4 x i32> @trunc_and_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
3197 ; SSE-LABEL: trunc_and_const_v4i64_v4i32:
3199 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3200 ; SSE-NEXT: andps {{.*}}(%rip), %xmm0
3203 ; AVX1-LABEL: trunc_and_const_v4i64_v4i32:
3205 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
3206 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3207 ; AVX1-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
3208 ; AVX1-NEXT: vzeroupper
3211 ; AVX2-SLOW-LABEL: trunc_and_const_v4i64_v4i32:
3212 ; AVX2-SLOW: # %bb.0:
3213 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
3214 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3215 ; AVX2-SLOW-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
3216 ; AVX2-SLOW-NEXT: vzeroupper
3217 ; AVX2-SLOW-NEXT: retq
3219 ; AVX2-FAST-LABEL: trunc_and_const_v4i64_v4i32:
3220 ; AVX2-FAST: # %bb.0:
3221 ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
3222 ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0
3223 ; AVX2-FAST-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
3224 ; AVX2-FAST-NEXT: vzeroupper
3225 ; AVX2-FAST-NEXT: retq
3227 ; AVX512-LABEL: trunc_and_const_v4i64_v4i32:
3229 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
3230 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
3231 ; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
3232 ; AVX512-NEXT: vzeroupper
3234 %1 = and <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
3235 %2 = trunc <4 x i64> %1 to <4 x i32>
3239 define <8 x i16> @trunc_and_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
3240 ; SSE-LABEL: trunc_and_const_v8i64_v8i16:
3242 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
3243 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
3244 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
3245 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
3246 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
3247 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
3248 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
3249 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
3250 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
3251 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3252 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
3253 ; SSE-NEXT: andpd {{.*}}(%rip), %xmm0
3256 ; AVX1-LABEL: trunc_and_const_v8i64_v8i16:
3258 ; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
3259 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
3260 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
3261 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
3262 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
3263 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
3264 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
3265 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
3266 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
3267 ; AVX1-NEXT: vzeroupper
3270 ; AVX2-SLOW-LABEL: trunc_and_const_v8i64_v8i16:
3271 ; AVX2-SLOW: # %bb.0:
3272 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
3273 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
3274 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
3275 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
3276 ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
3277 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
3278 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3279 ; AVX2-SLOW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
3280 ; AVX2-SLOW-NEXT: vzeroupper
3281 ; AVX2-SLOW-NEXT: retq
3283 ; AVX2-FAST-LABEL: trunc_and_const_v8i64_v8i16:
3284 ; AVX2-FAST: # %bb.0:
3285 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
3286 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
3287 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
3288 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
3289 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
3290 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3291 ; AVX2-FAST-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
3292 ; AVX2-FAST-NEXT: vzeroupper
3293 ; AVX2-FAST-NEXT: retq
3295 ; AVX512-LABEL: trunc_and_const_v8i64_v8i16:
3297 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0
3298 ; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
3299 ; AVX512-NEXT: vzeroupper
3301 %1 = and <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
3302 %2 = trunc <8 x i64> %1 to <8 x i16>
3306 define <8 x i16> @trunc_and_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
3307 ; SSE-LABEL: trunc_and_const_v8i32_v8i16:
3309 ; SSE-NEXT: pslld $16, %xmm1
3310 ; SSE-NEXT: psrad $16, %xmm1
3311 ; SSE-NEXT: pslld $16, %xmm0
3312 ; SSE-NEXT: psrad $16, %xmm0
3313 ; SSE-NEXT: packssdw %xmm1, %xmm0
3314 ; SSE-NEXT: pand {{.*}}(%rip), %xmm0
3317 ; AVX1-LABEL: trunc_and_const_v8i32_v8i16:
3319 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
3320 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
3321 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
3322 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
3323 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3324 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
3325 ; AVX1-NEXT: vzeroupper
3328 ; AVX2-LABEL: trunc_and_const_v8i32_v8i16:
3330 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
3331 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3332 ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
3333 ; AVX2-NEXT: vzeroupper
3336 ; AVX512-LABEL: trunc_and_const_v8i32_v8i16:
3338 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
3339 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
3340 ; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
3341 ; AVX512-NEXT: vzeroupper
3343 %1 = and <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
3344 %2 = trunc <8 x i32> %1 to <8 x i16>
3348 define <16 x i8> @trunc_and_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
3349 ; SSE-LABEL: trunc_and_const_v16i64_v16i8:
3351 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
3352 ; SSE-NEXT: pand %xmm8, %xmm7
3353 ; SSE-NEXT: pand %xmm8, %xmm6
3354 ; SSE-NEXT: packuswb %xmm7, %xmm6
3355 ; SSE-NEXT: pand %xmm8, %xmm5
3356 ; SSE-NEXT: pand %xmm8, %xmm4
3357 ; SSE-NEXT: packuswb %xmm5, %xmm4
3358 ; SSE-NEXT: packuswb %xmm6, %xmm4
3359 ; SSE-NEXT: pand %xmm8, %xmm3
3360 ; SSE-NEXT: pand %xmm8, %xmm2
3361 ; SSE-NEXT: packuswb %xmm3, %xmm2
3362 ; SSE-NEXT: pand %xmm8, %xmm1
3363 ; SSE-NEXT: pand %xmm8, %xmm0
3364 ; SSE-NEXT: packuswb %xmm1, %xmm0
3365 ; SSE-NEXT: packuswb %xmm2, %xmm0
3366 ; SSE-NEXT: packuswb %xmm4, %xmm0
3367 ; SSE-NEXT: pand {{.*}}(%rip), %xmm0
3370 ; AVX1-LABEL: trunc_and_const_v16i64_v16i8:
3372 ; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255]
3373 ; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
3374 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
3375 ; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
3376 ; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
3377 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
3378 ; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2
3379 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
3380 ; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
3381 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
3382 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
3383 ; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
3384 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
3385 ; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
3386 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
3387 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
3388 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
3389 ; AVX1-NEXT: vzeroupper
3392 ; AVX2-SLOW-LABEL: trunc_and_const_v16i64_v16i8:
3393 ; AVX2-SLOW: # %bb.0:
3394 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4
3395 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
3396 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4
3397 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
3398 ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
3399 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
3400 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
3401 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
3402 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
3403 ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
3404 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5
3405 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
3406 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5
3407 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
3408 ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
3409 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
3410 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3411 ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
3412 ; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
3413 ; AVX2-SLOW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
3414 ; AVX2-SLOW-NEXT: vzeroupper
3415 ; AVX2-SLOW-NEXT: retq
3417 ; AVX2-FAST-LABEL: trunc_and_const_v16i64_v16i8:
3418 ; AVX2-FAST: # %bb.0:
3419 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
3420 ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2
3421 ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3
3422 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
3423 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
3424 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2
3425 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
3426 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
3427 ; AVX2-FAST-NEXT: vpand %xmm5, %xmm2, %xmm2
3428 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0
3429 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1
3430 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
3431 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0
3432 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3433 ; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0
3434 ; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
3435 ; AVX2-FAST-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
3436 ; AVX2-FAST-NEXT: vzeroupper
3437 ; AVX2-FAST-NEXT: retq
3439 ; AVX512-LABEL: trunc_and_const_v16i64_v16i8:
3441 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
3442 ; AVX512-NEXT: vpmovqd %zmm1, %ymm1
3443 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
3444 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
3445 ; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
3446 ; AVX512-NEXT: vzeroupper
3448 %1 = and <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
3449 %2 = trunc <16 x i64> %1 to <16 x i8>
3453 define <16 x i8> @trunc_and_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
3454 ; SSE-LABEL: trunc_and_const_v16i32_v16i8:
3456 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
3457 ; SSE-NEXT: pand %xmm4, %xmm3
3458 ; SSE-NEXT: pand %xmm4, %xmm2
3459 ; SSE-NEXT: packuswb %xmm3, %xmm2
3460 ; SSE-NEXT: pand %xmm4, %xmm1
3461 ; SSE-NEXT: pand %xmm4, %xmm0
3462 ; SSE-NEXT: packuswb %xmm1, %xmm0
3463 ; SSE-NEXT: packuswb %xmm2, %xmm0
3464 ; SSE-NEXT: pand {{.*}}(%rip), %xmm0
3467 ; AVX1-LABEL: trunc_and_const_v16i32_v16i8:
3469 ; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
3470 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
3471 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
3472 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
3473 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
3474 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
3475 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
3476 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
3477 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
3478 ; AVX1-NEXT: vzeroupper
3481 ; AVX2-LABEL: trunc_and_const_v16i32_v16i8:
3483 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
3484 ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
3485 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
3486 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
3487 ; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1
3488 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
3489 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3490 ; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0
3491 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
3492 ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
3493 ; AVX2-NEXT: vzeroupper
3496 ; AVX512-LABEL: trunc_and_const_v16i32_v16i8:
3498 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
3499 ; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
3500 ; AVX512-NEXT: vzeroupper
3502 %1 = and <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
3503 %2 = trunc <16 x i32> %1 to <16 x i8>
3507 define <16 x i8> @trunc_and_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
3508 ; SSE-LABEL: trunc_and_const_v16i16_v16i8:
3510 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
3511 ; SSE-NEXT: pand %xmm2, %xmm1
3512 ; SSE-NEXT: pand %xmm2, %xmm0
3513 ; SSE-NEXT: packuswb %xmm1, %xmm0
3514 ; SSE-NEXT: pand {{.*}}(%rip), %xmm0
3517 ; AVX1-LABEL: trunc_and_const_v16i16_v16i8:
3519 ; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
3520 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
3521 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
3522 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
3523 ; AVX1-NEXT: vzeroupper
3526 ; AVX2-LABEL: trunc_and_const_v16i16_v16i8:
3528 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
3529 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
3530 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
3531 ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
3532 ; AVX2-NEXT: vzeroupper
3535 ; AVX512F-LABEL: trunc_and_const_v16i16_v16i8:
3537 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
3538 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
3539 ; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
3540 ; AVX512F-NEXT: vzeroupper
3541 ; AVX512F-NEXT: retq
3543 ; AVX512BW-LABEL: trunc_and_const_v16i16_v16i8:
3544 ; AVX512BW: # %bb.0:
3545 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
3546 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
3547 ; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
3548 ; AVX512BW-NEXT: vzeroupper
3549 ; AVX512BW-NEXT: retq
3551 ; AVX512DQ-LABEL: trunc_and_const_v16i16_v16i8:
3552 ; AVX512DQ: # %bb.0:
3553 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
3554 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
3555 ; AVX512DQ-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
3556 ; AVX512DQ-NEXT: vzeroupper
3557 ; AVX512DQ-NEXT: retq
3558 %1 = and <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
3559 %2 = trunc <16 x i16> %1 to <16 x i8>
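;
; xor: bitwise 'xor' followed by truncation; the lowering mirrors the 'and' tests above,
; with pxor/vpxor/vpxorq performed at the wide type before the truncation shuffles
; (SSE/AVX) or vpmov* moves (AVX512).
;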
3567 define <4 x i32> @trunc_xor_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
3568 ; SSE-LABEL: trunc_xor_v4i64_v4i32:
3570 ; SSE-NEXT: xorps %xmm3, %xmm1
3571 ; SSE-NEXT: xorps %xmm2, %xmm0
3572 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3575 ; AVX1-LABEL: trunc_xor_v4i64_v4i32:
3577 ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
3578 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
3579 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3580 ; AVX1-NEXT: vzeroupper
3583 ; AVX2-SLOW-LABEL: trunc_xor_v4i64_v4i32:
3584 ; AVX2-SLOW: # %bb.0:
3585 ; AVX2-SLOW-NEXT: vxorps %ymm1, %ymm0, %ymm0
3586 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
3587 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3588 ; AVX2-SLOW-NEXT: vzeroupper
3589 ; AVX2-SLOW-NEXT: retq
3591 ; AVX2-FAST-LABEL: trunc_xor_v4i64_v4i32:
3592 ; AVX2-FAST: # %bb.0:
3593 ; AVX2-FAST-NEXT: vxorps %ymm1, %ymm0, %ymm0
3594 ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
3595 ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0
3596 ; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
3597 ; AVX2-FAST-NEXT: vzeroupper
3598 ; AVX2-FAST-NEXT: retq
3600 ; AVX512-LABEL: trunc_xor_v4i64_v4i32:
3602 ; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0
3603 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
3604 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
3605 ; AVX512-NEXT: vzeroupper
3607 %1 = xor <4 x i64> %a0, %a1
3608 %2 = trunc <4 x i64> %1 to <4 x i32>
3612 define <8 x i16> @trunc_xor_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
3613 ; SSE-LABEL: trunc_xor_v8i64_v8i16:
3615 ; SSE-NEXT: pxor %xmm6, %xmm2
3616 ; SSE-NEXT: pxor %xmm7, %xmm3
3617 ; SSE-NEXT: pxor %xmm4, %xmm0
3618 ; SSE-NEXT: pxor %xmm5, %xmm1
3619 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
3620 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
3621 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
3622 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
3623 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
3624 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
3625 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
3626 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
3627 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
3628 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3629 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
3632 ; AVX1-LABEL: trunc_xor_v8i64_v8i16:
3634 ; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0
3635 ; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm1
3636 ; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
3637 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
3638 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
3639 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
3640 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
3641 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
3642 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
3643 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
3644 ; AVX1-NEXT: vzeroupper
3647 ; AVX2-SLOW-LABEL: trunc_xor_v8i64_v8i16:
3648 ; AVX2-SLOW: # %bb.0:
3649 ; AVX2-SLOW-NEXT: vxorps %ymm2, %ymm0, %ymm0
3650 ; AVX2-SLOW-NEXT: vxorps %ymm3, %ymm1, %ymm1
3651 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
3652 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
3653 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
3654 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
3655 ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
3656 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
3657 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3658 ; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
3659 ; AVX2-SLOW-NEXT: vzeroupper
3660 ; AVX2-SLOW-NEXT: retq
3662 ; AVX2-FAST-LABEL: trunc_xor_v8i64_v8i16:
3663 ; AVX2-FAST: # %bb.0:
3664 ; AVX2-FAST-NEXT: vpxor %ymm3, %ymm1, %ymm1
3665 ; AVX2-FAST-NEXT: vpxor %ymm2, %ymm0, %ymm0
3666 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
3667 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
3668 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
3669 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
3670 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
3671 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3672 ; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
3673 ; AVX2-FAST-NEXT: vzeroupper
3674 ; AVX2-FAST-NEXT: retq
3676 ; AVX512-LABEL: trunc_xor_v8i64_v8i16:
3678 ; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
3679 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0
3680 ; AVX512-NEXT: vzeroupper
3682 %1 = xor <8 x i64> %a0, %a1
3683 %2 = trunc <8 x i64> %1 to <8 x i16>
3687 define <8 x i16> @trunc_xor_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
3688 ; SSE-LABEL: trunc_xor_v8i32_v8i16:
3690 ; SSE-NEXT: pxor %xmm2, %xmm0
3691 ; SSE-NEXT: pxor %xmm3, %xmm1
3692 ; SSE-NEXT: pslld $16, %xmm1
3693 ; SSE-NEXT: psrad $16, %xmm1
3694 ; SSE-NEXT: pslld $16, %xmm0
3695 ; SSE-NEXT: psrad $16, %xmm0
3696 ; SSE-NEXT: packssdw %xmm1, %xmm0
3699 ; AVX1-LABEL: trunc_xor_v8i32_v8i16:
3701 ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
3702 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
3703 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
3704 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
3705 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
3706 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3707 ; AVX1-NEXT: vzeroupper
3710 ; AVX2-LABEL: trunc_xor_v8i32_v8i16:
3712 ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
3713 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
3714 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3715 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
3716 ; AVX2-NEXT: vzeroupper
3719 ; AVX512-LABEL: trunc_xor_v8i32_v8i16:
3721 ; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0
3722 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
3723 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
3724 ; AVX512-NEXT: vzeroupper
3726 %1 = xor <8 x i32> %a0, %a1
3727 %2 = trunc <8 x i32> %1 to <8 x i16>
3731 define <16 x i8> @trunc_xor_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
3732 ; SSE-LABEL: trunc_xor_v16i64_v16i8:
3734 ; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm0
3735 ; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm1
3736 ; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm2
3737 ; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm3
3738 ; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm4
3739 ; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm5
3740 ; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm6
3741 ; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm7
3742 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
3743 ; SSE-NEXT: pand %xmm8, %xmm7
3744 ; SSE-NEXT: pand %xmm8, %xmm6
3745 ; SSE-NEXT: packuswb %xmm7, %xmm6
3746 ; SSE-NEXT: pand %xmm8, %xmm5
3747 ; SSE-NEXT: pand %xmm8, %xmm4
3748 ; SSE-NEXT: packuswb %xmm5, %xmm4
3749 ; SSE-NEXT: packuswb %xmm6, %xmm4
3750 ; SSE-NEXT: pand %xmm8, %xmm3
3751 ; SSE-NEXT: pand %xmm8, %xmm2
3752 ; SSE-NEXT: packuswb %xmm3, %xmm2
3753 ; SSE-NEXT: pand %xmm8, %xmm1
3754 ; SSE-NEXT: pand %xmm8, %xmm0
3755 ; SSE-NEXT: packuswb %xmm1, %xmm0
3756 ; SSE-NEXT: packuswb %xmm2, %xmm0
3757 ; SSE-NEXT: packuswb %xmm4, %xmm0
3760 ; AVX1-LABEL: trunc_xor_v16i64_v16i8:
3762 ; AVX1-NEXT: vxorps %ymm4, %ymm0, %ymm0
3763 ; AVX1-NEXT: vxorps %ymm5, %ymm1, %ymm1
3764 ; AVX1-NEXT: vxorps %ymm6, %ymm2, %ymm2
3765 ; AVX1-NEXT: vxorps %ymm7, %ymm3, %ymm3
3766 ; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255]
3767 ; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
3768 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
3769 ; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
3770 ; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
3771 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
3772 ; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2
3773 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
3774 ; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
3775 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
3776 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
3777 ; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
3778 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
3779 ; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
3780 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
3781 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
3782 ; AVX1-NEXT: vzeroupper
3785 ; AVX2-SLOW-LABEL: trunc_xor_v16i64_v16i8:
3786 ; AVX2-SLOW: # %bb.0:
3787 ; AVX2-SLOW-NEXT: vxorps %ymm4, %ymm0, %ymm0
3788 ; AVX2-SLOW-NEXT: vxorps %ymm5, %ymm1, %ymm1
3789 ; AVX2-SLOW-NEXT: vxorps %ymm6, %ymm2, %ymm2
3790 ; AVX2-SLOW-NEXT: vxorps %ymm7, %ymm3, %ymm3
3791 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4
3792 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
3793 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4
3794 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
3795 ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
3796 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
3797 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
3798 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
3799 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
3800 ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
3801 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5
3802 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
3803 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5
3804 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
3805 ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
3806 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
3807 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3808 ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
3809 ; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
3810 ; AVX2-SLOW-NEXT: vzeroupper
3811 ; AVX2-SLOW-NEXT: retq
3813 ; AVX2-FAST-LABEL: trunc_xor_v16i64_v16i8:
3814 ; AVX2-FAST: # %bb.0:
3815 ; AVX2-FAST-NEXT: vpxor %ymm5, %ymm1, %ymm1
3816 ; AVX2-FAST-NEXT: vpxor %ymm4, %ymm0, %ymm0
3817 ; AVX2-FAST-NEXT: vpxor %ymm7, %ymm3, %ymm3
3818 ; AVX2-FAST-NEXT: vpxor %ymm6, %ymm2, %ymm2
3819 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
3820 ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2
3821 ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3
3822 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
3823 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
3824 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2
3825 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
3826 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
3827 ; AVX2-FAST-NEXT: vpand %xmm5, %xmm2, %xmm2
3828 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0
3829 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1
3830 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
3831 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0
3832 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3833 ; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0
3834 ; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
3835 ; AVX2-FAST-NEXT: vzeroupper
3836 ; AVX2-FAST-NEXT: retq
3838 ; AVX512-LABEL: trunc_xor_v16i64_v16i8:
3840 ; AVX512-NEXT: vpxorq %zmm3, %zmm1, %zmm1
3841 ; AVX512-NEXT: vpxorq %zmm2, %zmm0, %zmm0
3842 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
3843 ; AVX512-NEXT: vpmovqd %zmm1, %ymm1
3844 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
3845 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
3846 ; AVX512-NEXT: vzeroupper
3848 %1 = xor <16 x i64> %a0, %a1
3849 %2 = trunc <16 x i64> %1 to <16 x i8>
3853 define <16 x i8> @trunc_xor_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
3854 ; SSE-LABEL: trunc_xor_v16i32_v16i8:
3856 ; SSE-NEXT: pxor %xmm4, %xmm0
3857 ; SSE-NEXT: pxor %xmm5, %xmm1
3858 ; SSE-NEXT: pxor %xmm6, %xmm2
3859 ; SSE-NEXT: pxor %xmm7, %xmm3
3860 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
3861 ; SSE-NEXT: pand %xmm4, %xmm3
3862 ; SSE-NEXT: pand %xmm4, %xmm2
3863 ; SSE-NEXT: packuswb %xmm3, %xmm2
3864 ; SSE-NEXT: pand %xmm4, %xmm1
3865 ; SSE-NEXT: pand %xmm4, %xmm0
3866 ; SSE-NEXT: packuswb %xmm1, %xmm0
3867 ; SSE-NEXT: packuswb %xmm2, %xmm0
3870 ; AVX1-LABEL: trunc_xor_v16i32_v16i8:
3872 ; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0
3873 ; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm1
3874 ; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
3875 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
3876 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
3877 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
3878 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
3879 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
3880 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
3881 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
3882 ; AVX1-NEXT: vzeroupper
3885 ; AVX2-LABEL: trunc_xor_v16i32_v16i8:
3887 ; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
3888 ; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1
3889 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
3890 ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
3891 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
3892 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
3893 ; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1
3894 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
3895 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3896 ; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0
3897 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
3898 ; AVX2-NEXT: vzeroupper
3901 ; AVX512-LABEL: trunc_xor_v16i32_v16i8:
3903 ; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0
3904 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
3905 ; AVX512-NEXT: vzeroupper
3907 %1 = xor <16 x i32> %a0, %a1
3908 %2 = trunc <16 x i32> %1 to <16 x i8>
3912 define <16 x i8> @trunc_xor_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
3913 ; SSE-LABEL: trunc_xor_v16i16_v16i8:
3915 ; SSE-NEXT: pxor %xmm2, %xmm0
3916 ; SSE-NEXT: pxor %xmm3, %xmm1
3917 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
3918 ; SSE-NEXT: pand %xmm2, %xmm1
3919 ; SSE-NEXT: pand %xmm2, %xmm0
3920 ; SSE-NEXT: packuswb %xmm1, %xmm0
3923 ; AVX1-LABEL: trunc_xor_v16i16_v16i8:
3925 ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
3926 ; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
3927 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
3928 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
3929 ; AVX1-NEXT: vzeroupper
3932 ; AVX2-LABEL: trunc_xor_v16i16_v16i8:
3934 ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
3935 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
3936 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
3937 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
3938 ; AVX2-NEXT: vzeroupper
3941 ; AVX512F-LABEL: trunc_xor_v16i16_v16i8:
3943 ; AVX512F-NEXT: vpxor %ymm1, %ymm0, %ymm0
3944 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
3945 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
3946 ; AVX512F-NEXT: vzeroupper
3947 ; AVX512F-NEXT: retq
3949 ; AVX512BW-LABEL: trunc_xor_v16i16_v16i8:
3950 ; AVX512BW: # %bb.0:
3951 ; AVX512BW-NEXT: vpxor %ymm1, %ymm0, %ymm0
3952 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
3953 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
3954 ; AVX512BW-NEXT: vzeroupper
3955 ; AVX512BW-NEXT: retq
3957 ; AVX512DQ-LABEL: trunc_xor_v16i16_v16i8:
3958 ; AVX512DQ: # %bb.0:
3959 ; AVX512DQ-NEXT: vpxor %ymm1, %ymm0, %ymm0
3960 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
3961 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
3962 ; AVX512DQ-NEXT: vzeroupper
3963 ; AVX512DQ-NEXT: retq
3964 %1 = xor <16 x i16> %a0, %a1
3965 %2 = trunc <16 x i16> %1 to <16 x i8>
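;
; xor constant: 'xor' with a constant vector followed by truncation. As with the 'and'
; constant tests, the truncation is done first and the constant is applied to the
; narrowed value with a single pxor/vpxor against a constant-pool operand.
;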
3973 define <4 x i32> @trunc_xor_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
3974 ; SSE-LABEL: trunc_xor_const_v4i64_v4i32:
3976 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3977 ; SSE-NEXT: xorps {{.*}}(%rip), %xmm0
3980 ; AVX1-LABEL: trunc_xor_const_v4i64_v4i32:
3982 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
3983 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3984 ; AVX1-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm0
3985 ; AVX1-NEXT: vzeroupper
3988 ; AVX2-SLOW-LABEL: trunc_xor_const_v4i64_v4i32:
3989 ; AVX2-SLOW: # %bb.0:
3990 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
3991 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3992 ; AVX2-SLOW-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm0
3993 ; AVX2-SLOW-NEXT: vzeroupper
3994 ; AVX2-SLOW-NEXT: retq
3996 ; AVX2-FAST-LABEL: trunc_xor_const_v4i64_v4i32:
3997 ; AVX2-FAST: # %bb.0:
3998 ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
3999 ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0
4000 ; AVX2-FAST-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm0
4001 ; AVX2-FAST-NEXT: vzeroupper
4002 ; AVX2-FAST-NEXT: retq
4004 ; AVX512-LABEL: trunc_xor_const_v4i64_v4i32:
4006 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
4007 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
4008 ; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
4009 ; AVX512-NEXT: vzeroupper
4011 %1 = xor <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
4012 %2 = trunc <4 x i64> %1 to <4 x i32>
4016 define <8 x i16> @trunc_xor_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
4017 ; SSE-LABEL: trunc_xor_const_v8i64_v8i16:
4019 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
4020 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
4021 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
4022 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
4023 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
4024 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
4025 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
4026 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
4027 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
4028 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
4029 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
4030 ; SSE-NEXT: xorpd {{.*}}(%rip), %xmm0
4033 ; AVX1-LABEL: trunc_xor_const_v8i64_v8i16:
4035 ; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
4036 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
4037 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
4038 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
4039 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
4040 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
4041 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
4042 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
4043 ; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
4044 ; AVX1-NEXT: vzeroupper
4047 ; AVX2-SLOW-LABEL: trunc_xor_const_v8i64_v8i16:
4048 ; AVX2-SLOW: # %bb.0:
4049 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
4050 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
4051 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
4052 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
4053 ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
4054 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
4055 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4056 ; AVX2-SLOW-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
4057 ; AVX2-SLOW-NEXT: vzeroupper
4058 ; AVX2-SLOW-NEXT: retq
4060 ; AVX2-FAST-LABEL: trunc_xor_const_v8i64_v8i16:
4061 ; AVX2-FAST: # %bb.0:
4062 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
4063 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
4064 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
4065 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
4066 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
4067 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4068 ; AVX2-FAST-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
4069 ; AVX2-FAST-NEXT: vzeroupper
4070 ; AVX2-FAST-NEXT: retq
4072 ; AVX512-LABEL: trunc_xor_const_v8i64_v8i16:
4074 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0
4075 ; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
4076 ; AVX512-NEXT: vzeroupper
4078 %1 = xor <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
4079 %2 = trunc <8 x i64> %1 to <8 x i16>
4083 define <8 x i16> @trunc_xor_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
4084 ; SSE-LABEL: trunc_xor_const_v8i32_v8i16:
4086 ; SSE-NEXT: pslld $16, %xmm1
4087 ; SSE-NEXT: psrad $16, %xmm1
4088 ; SSE-NEXT: pslld $16, %xmm0
4089 ; SSE-NEXT: psrad $16, %xmm0
4090 ; SSE-NEXT: packssdw %xmm1, %xmm0
4091 ; SSE-NEXT: pxor {{.*}}(%rip), %xmm0
4094 ; AVX1-LABEL: trunc_xor_const_v8i32_v8i16:
4096 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
4097 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
4098 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
4099 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
4100 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4101 ; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
4102 ; AVX1-NEXT: vzeroupper
4105 ; AVX2-LABEL: trunc_xor_const_v8i32_v8i16:
4107 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
4108 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4109 ; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
4110 ; AVX2-NEXT: vzeroupper
4113 ; AVX512-LABEL: trunc_xor_const_v8i32_v8i16:
4115 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
4116 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
4117 ; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
4118 ; AVX512-NEXT: vzeroupper
4120 %1 = xor <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
4121 %2 = trunc <8 x i32> %1 to <8 x i16>
4125 define <16 x i8> @trunc_xor_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
4126 ; SSE-LABEL: trunc_xor_const_v16i64_v16i8:
4128 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
4129 ; SSE-NEXT: pand %xmm8, %xmm7
4130 ; SSE-NEXT: pand %xmm8, %xmm6
4131 ; SSE-NEXT: packuswb %xmm7, %xmm6
4132 ; SSE-NEXT: pand %xmm8, %xmm5
4133 ; SSE-NEXT: pand %xmm8, %xmm4
4134 ; SSE-NEXT: packuswb %xmm5, %xmm4
4135 ; SSE-NEXT: packuswb %xmm6, %xmm4
4136 ; SSE-NEXT: pand %xmm8, %xmm3
4137 ; SSE-NEXT: pand %xmm8, %xmm2
4138 ; SSE-NEXT: packuswb %xmm3, %xmm2
4139 ; SSE-NEXT: pand %xmm8, %xmm1
4140 ; SSE-NEXT: pand %xmm8, %xmm0
4141 ; SSE-NEXT: packuswb %xmm1, %xmm0
4142 ; SSE-NEXT: packuswb %xmm2, %xmm0
4143 ; SSE-NEXT: packuswb %xmm4, %xmm0
4144 ; SSE-NEXT: pxor {{.*}}(%rip), %xmm0
4147 ; AVX1-LABEL: trunc_xor_const_v16i64_v16i8:
4149 ; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255]
4150 ; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
4151 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
4152 ; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
4153 ; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
4154 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
4155 ; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2
4156 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
4157 ; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
4158 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
4159 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
4160 ; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
4161 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
4162 ; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
4163 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
4164 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
4165 ; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
4166 ; AVX1-NEXT: vzeroupper
4169 ; AVX2-SLOW-LABEL: trunc_xor_const_v16i64_v16i8:
4170 ; AVX2-SLOW: # %bb.0:
4171 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4
4172 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
4173 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4
4174 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
4175 ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
4176 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
4177 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
4178 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
4179 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
4180 ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
4181 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5
4182 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
4183 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5
4184 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
4185 ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
4186 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
4187 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4188 ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
4189 ; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
4190 ; AVX2-SLOW-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
4191 ; AVX2-SLOW-NEXT: vzeroupper
4192 ; AVX2-SLOW-NEXT: retq
4194 ; AVX2-FAST-LABEL: trunc_xor_const_v16i64_v16i8:
4195 ; AVX2-FAST: # %bb.0:
4196 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
4197 ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2
4198 ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3
4199 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
4200 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
4201 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2
4202 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
4203 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
4204 ; AVX2-FAST-NEXT: vpand %xmm5, %xmm2, %xmm2
4205 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0
4206 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1
4207 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
4208 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0
4209 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4210 ; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0
4211 ; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
4212 ; AVX2-FAST-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
4213 ; AVX2-FAST-NEXT: vzeroupper
4214 ; AVX2-FAST-NEXT: retq
4216 ; AVX512-LABEL: trunc_xor_const_v16i64_v16i8:
4218 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
4219 ; AVX512-NEXT: vpmovqd %zmm1, %ymm1
4220 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
4221 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
4222 ; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
4223 ; AVX512-NEXT: vzeroupper
4225 %1 = xor <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
4226 %2 = trunc <16 x i64> %1 to <16 x i8>
4227 ret <16 x i8> %2
4228 }
4230 define <16 x i8> @trunc_xor_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
4231 ; SSE-LABEL: trunc_xor_const_v16i32_v16i8:
4233 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
4234 ; SSE-NEXT: pand %xmm4, %xmm3
4235 ; SSE-NEXT: pand %xmm4, %xmm2
4236 ; SSE-NEXT: packuswb %xmm3, %xmm2
4237 ; SSE-NEXT: pand %xmm4, %xmm1
4238 ; SSE-NEXT: pand %xmm4, %xmm0
4239 ; SSE-NEXT: packuswb %xmm1, %xmm0
4240 ; SSE-NEXT: packuswb %xmm2, %xmm0
4241 ; SSE-NEXT: pxor {{.*}}(%rip), %xmm0
4244 ; AVX1-LABEL: trunc_xor_const_v16i32_v16i8:
4246 ; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
4247 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
4248 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
4249 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
4250 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
4251 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
4252 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
4253 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
4254 ; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
4255 ; AVX1-NEXT: vzeroupper
4258 ; AVX2-LABEL: trunc_xor_const_v16i32_v16i8:
4260 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
4261 ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
4262 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
4263 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
4264 ; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1
4265 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
4266 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4267 ; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0
4268 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
4269 ; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
4270 ; AVX2-NEXT: vzeroupper
4273 ; AVX512-LABEL: trunc_xor_const_v16i32_v16i8:
4275 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
4276 ; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
4277 ; AVX512-NEXT: vzeroupper
4279 %1 = xor <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
4280 %2 = trunc <16 x i32> %1 to <16 x i8>
4281 ret <16 x i8> %2
4282 }
4284 define <16 x i8> @trunc_xor_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
4285 ; SSE-LABEL: trunc_xor_const_v16i16_v16i8:
4287 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
4288 ; SSE-NEXT: pand %xmm2, %xmm1
4289 ; SSE-NEXT: pand %xmm2, %xmm0
4290 ; SSE-NEXT: packuswb %xmm1, %xmm0
4291 ; SSE-NEXT: pxor {{.*}}(%rip), %xmm0
4294 ; AVX1-LABEL: trunc_xor_const_v16i16_v16i8:
4296 ; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
4297 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
4298 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
4299 ; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
4300 ; AVX1-NEXT: vzeroupper
4303 ; AVX2-LABEL: trunc_xor_const_v16i16_v16i8:
4305 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
4306 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
4307 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
4308 ; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
4309 ; AVX2-NEXT: vzeroupper
4312 ; AVX512F-LABEL: trunc_xor_const_v16i16_v16i8:
4314 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
4315 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
4316 ; AVX512F-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
4317 ; AVX512F-NEXT: vzeroupper
4318 ; AVX512F-NEXT: retq
4320 ; AVX512BW-LABEL: trunc_xor_const_v16i16_v16i8:
4321 ; AVX512BW: # %bb.0:
4322 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
4323 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
4324 ; AVX512BW-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
4325 ; AVX512BW-NEXT: vzeroupper
4326 ; AVX512BW-NEXT: retq
4328 ; AVX512DQ-LABEL: trunc_xor_const_v16i16_v16i8:
4329 ; AVX512DQ: # %bb.0:
4330 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
4331 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
4332 ; AVX512DQ-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
4333 ; AVX512DQ-NEXT: vzeroupper
4334 ; AVX512DQ-NEXT: retq
4335 %1 = xor <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
4336 %2 = trunc <16 x i16> %1 to <16 x i8>
4337 ret <16 x i8> %2
4338 }
4344 define <4 x i32> @trunc_or_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
4345 ; SSE-LABEL: trunc_or_v4i64_v4i32:
4347 ; SSE-NEXT: orps %xmm3, %xmm1
4348 ; SSE-NEXT: orps %xmm2, %xmm0
4349 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
4352 ; AVX1-LABEL: trunc_or_v4i64_v4i32:
4354 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
4355 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
4356 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
4357 ; AVX1-NEXT: vzeroupper
4360 ; AVX2-SLOW-LABEL: trunc_or_v4i64_v4i32:
4361 ; AVX2-SLOW: # %bb.0:
4362 ; AVX2-SLOW-NEXT: vorps %ymm1, %ymm0, %ymm0
4363 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
4364 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
4365 ; AVX2-SLOW-NEXT: vzeroupper
4366 ; AVX2-SLOW-NEXT: retq
4368 ; AVX2-FAST-LABEL: trunc_or_v4i64_v4i32:
4369 ; AVX2-FAST: # %bb.0:
4370 ; AVX2-FAST-NEXT: vorps %ymm1, %ymm0, %ymm0
4371 ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
4372 ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0
4373 ; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
4374 ; AVX2-FAST-NEXT: vzeroupper
4375 ; AVX2-FAST-NEXT: retq
4377 ; AVX512-LABEL: trunc_or_v4i64_v4i32:
4379 ; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
4380 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
4381 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
4382 ; AVX512-NEXT: vzeroupper
4384 %1 = or <4 x i64> %a0, %a1
4385 %2 = trunc <4 x i64> %1 to <4 x i32>
4386 ret <4 x i32> %2
4387 }
4389 define <8 x i16> @trunc_or_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
4390 ; SSE-LABEL: trunc_or_v8i64_v8i16:
4392 ; SSE-NEXT: por %xmm6, %xmm2
4393 ; SSE-NEXT: por %xmm7, %xmm3
4394 ; SSE-NEXT: por %xmm4, %xmm0
4395 ; SSE-NEXT: por %xmm5, %xmm1
4396 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
4397 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
4398 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
4399 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
4400 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
4401 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
4402 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
4403 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
4404 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
4405 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
4406 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
4409 ; AVX1-LABEL: trunc_or_v8i64_v8i16:
4411 ; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0
4412 ; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1
4413 ; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
4414 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
4415 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
4416 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
4417 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
4418 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
4419 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
4420 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
4421 ; AVX1-NEXT: vzeroupper
4424 ; AVX2-SLOW-LABEL: trunc_or_v8i64_v8i16:
4425 ; AVX2-SLOW: # %bb.0:
4426 ; AVX2-SLOW-NEXT: vorps %ymm2, %ymm0, %ymm0
4427 ; AVX2-SLOW-NEXT: vorps %ymm3, %ymm1, %ymm1
4428 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
4429 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
4430 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
4431 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
4432 ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
4433 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
4434 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4435 ; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
4436 ; AVX2-SLOW-NEXT: vzeroupper
4437 ; AVX2-SLOW-NEXT: retq
4439 ; AVX2-FAST-LABEL: trunc_or_v8i64_v8i16:
4440 ; AVX2-FAST: # %bb.0:
4441 ; AVX2-FAST-NEXT: vpor %ymm3, %ymm1, %ymm1
4442 ; AVX2-FAST-NEXT: vpor %ymm2, %ymm0, %ymm0
4443 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
4444 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
4445 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
4446 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
4447 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
4448 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4449 ; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
4450 ; AVX2-FAST-NEXT: vzeroupper
4451 ; AVX2-FAST-NEXT: retq
4453 ; AVX512-LABEL: trunc_or_v8i64_v8i16:
4455 ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
4456 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0
4457 ; AVX512-NEXT: vzeroupper
4459 %1 = or <8 x i64> %a0, %a1
4460 %2 = trunc <8 x i64> %1 to <8 x i16>
4461 ret <8 x i16> %2
4462 }
4464 define <8 x i16> @trunc_or_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
4465 ; SSE-LABEL: trunc_or_v8i32_v8i16:
4467 ; SSE-NEXT: por %xmm2, %xmm0
4468 ; SSE-NEXT: por %xmm3, %xmm1
4469 ; SSE-NEXT: pslld $16, %xmm1
4470 ; SSE-NEXT: psrad $16, %xmm1
4471 ; SSE-NEXT: pslld $16, %xmm0
4472 ; SSE-NEXT: psrad $16, %xmm0
4473 ; SSE-NEXT: packssdw %xmm1, %xmm0
4476 ; AVX1-LABEL: trunc_or_v8i32_v8i16:
4478 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
4479 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
4480 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
4481 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
4482 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
4483 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4484 ; AVX1-NEXT: vzeroupper
4487 ; AVX2-LABEL: trunc_or_v8i32_v8i16:
4489 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
4490 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
4491 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4492 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
4493 ; AVX2-NEXT: vzeroupper
4496 ; AVX512-LABEL: trunc_or_v8i32_v8i16:
4498 ; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
4499 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
4500 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
4501 ; AVX512-NEXT: vzeroupper
4503 %1 = or <8 x i32> %a0, %a1
4504 %2 = trunc <8 x i32> %1 to <8 x i16>
4505 ret <8 x i16> %2
4506 }
4508 define <16 x i8> @trunc_or_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
4509 ; SSE-LABEL: trunc_or_v16i64_v16i8:
4511 ; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm0
4512 ; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm1
4513 ; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm2
4514 ; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm3
4515 ; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm4
4516 ; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm5
4517 ; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm6
4518 ; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm7
4519 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
4520 ; SSE-NEXT: pand %xmm8, %xmm7
4521 ; SSE-NEXT: pand %xmm8, %xmm6
4522 ; SSE-NEXT: packuswb %xmm7, %xmm6
4523 ; SSE-NEXT: pand %xmm8, %xmm5
4524 ; SSE-NEXT: pand %xmm8, %xmm4
4525 ; SSE-NEXT: packuswb %xmm5, %xmm4
4526 ; SSE-NEXT: packuswb %xmm6, %xmm4
4527 ; SSE-NEXT: pand %xmm8, %xmm3
4528 ; SSE-NEXT: pand %xmm8, %xmm2
4529 ; SSE-NEXT: packuswb %xmm3, %xmm2
4530 ; SSE-NEXT: pand %xmm8, %xmm1
4531 ; SSE-NEXT: pand %xmm8, %xmm0
4532 ; SSE-NEXT: packuswb %xmm1, %xmm0
4533 ; SSE-NEXT: packuswb %xmm2, %xmm0
4534 ; SSE-NEXT: packuswb %xmm4, %xmm0
4537 ; AVX1-LABEL: trunc_or_v16i64_v16i8:
4539 ; AVX1-NEXT: vorps %ymm4, %ymm0, %ymm0
4540 ; AVX1-NEXT: vorps %ymm5, %ymm1, %ymm1
4541 ; AVX1-NEXT: vorps %ymm6, %ymm2, %ymm2
4542 ; AVX1-NEXT: vorps %ymm7, %ymm3, %ymm3
4543 ; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255]
4544 ; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
4545 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
4546 ; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
4547 ; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
4548 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
4549 ; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2
4550 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
4551 ; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
4552 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
4553 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
4554 ; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
4555 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
4556 ; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
4557 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
4558 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
4559 ; AVX1-NEXT: vzeroupper
4562 ; AVX2-SLOW-LABEL: trunc_or_v16i64_v16i8:
4563 ; AVX2-SLOW: # %bb.0:
4564 ; AVX2-SLOW-NEXT: vorps %ymm4, %ymm0, %ymm0
4565 ; AVX2-SLOW-NEXT: vorps %ymm5, %ymm1, %ymm1
4566 ; AVX2-SLOW-NEXT: vorps %ymm6, %ymm2, %ymm2
4567 ; AVX2-SLOW-NEXT: vorps %ymm7, %ymm3, %ymm3
4568 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4
4569 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
4570 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4
4571 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
4572 ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
4573 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
4574 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
4575 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
4576 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
4577 ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
4578 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5
4579 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
4580 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5
4581 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
4582 ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
4583 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
4584 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4585 ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
4586 ; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
4587 ; AVX2-SLOW-NEXT: vzeroupper
4588 ; AVX2-SLOW-NEXT: retq
4590 ; AVX2-FAST-LABEL: trunc_or_v16i64_v16i8:
4591 ; AVX2-FAST: # %bb.0:
4592 ; AVX2-FAST-NEXT: vpor %ymm5, %ymm1, %ymm1
4593 ; AVX2-FAST-NEXT: vpor %ymm4, %ymm0, %ymm0
4594 ; AVX2-FAST-NEXT: vpor %ymm7, %ymm3, %ymm3
4595 ; AVX2-FAST-NEXT: vpor %ymm6, %ymm2, %ymm2
4596 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
4597 ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2
4598 ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3
4599 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
4600 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
4601 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2
4602 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
4603 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
4604 ; AVX2-FAST-NEXT: vpand %xmm5, %xmm2, %xmm2
4605 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0
4606 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1
4607 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
4608 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0
4609 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4610 ; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0
4611 ; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
4612 ; AVX2-FAST-NEXT: vzeroupper
4613 ; AVX2-FAST-NEXT: retq
4615 ; AVX512-LABEL: trunc_or_v16i64_v16i8:
4617 ; AVX512-NEXT: vporq %zmm3, %zmm1, %zmm1
4618 ; AVX512-NEXT: vporq %zmm2, %zmm0, %zmm0
4619 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
4620 ; AVX512-NEXT: vpmovqd %zmm1, %ymm1
4621 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
4622 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
4623 ; AVX512-NEXT: vzeroupper
4625 %1 = or <16 x i64> %a0, %a1
4626 %2 = trunc <16 x i64> %1 to <16 x i8>
4627 ret <16 x i8> %2
4628 }
4630 define <16 x i8> @trunc_or_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
4631 ; SSE-LABEL: trunc_or_v16i32_v16i8:
4633 ; SSE-NEXT: por %xmm4, %xmm0
4634 ; SSE-NEXT: por %xmm5, %xmm1
4635 ; SSE-NEXT: por %xmm6, %xmm2
4636 ; SSE-NEXT: por %xmm7, %xmm3
4637 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
4638 ; SSE-NEXT: pand %xmm4, %xmm3
4639 ; SSE-NEXT: pand %xmm4, %xmm2
4640 ; SSE-NEXT: packuswb %xmm3, %xmm2
4641 ; SSE-NEXT: pand %xmm4, %xmm1
4642 ; SSE-NEXT: pand %xmm4, %xmm0
4643 ; SSE-NEXT: packuswb %xmm1, %xmm0
4644 ; SSE-NEXT: packuswb %xmm2, %xmm0
4647 ; AVX1-LABEL: trunc_or_v16i32_v16i8:
4649 ; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0
4650 ; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1
4651 ; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
4652 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
4653 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
4654 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
4655 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
4656 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
4657 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
4658 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
4659 ; AVX1-NEXT: vzeroupper
4662 ; AVX2-LABEL: trunc_or_v16i32_v16i8:
4664 ; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0
4665 ; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1
4666 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
4667 ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
4668 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
4669 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
4670 ; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1
4671 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
4672 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4673 ; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0
4674 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
4675 ; AVX2-NEXT: vzeroupper
4678 ; AVX512-LABEL: trunc_or_v16i32_v16i8:
4680 ; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0
4681 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
4682 ; AVX512-NEXT: vzeroupper
4684 %1 = or <16 x i32> %a0, %a1
4685 %2 = trunc <16 x i32> %1 to <16 x i8>
4686 ret <16 x i8> %2
4687 }
4689 define <16 x i8> @trunc_or_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
4690 ; SSE-LABEL: trunc_or_v16i16_v16i8:
4692 ; SSE-NEXT: por %xmm2, %xmm0
4693 ; SSE-NEXT: por %xmm3, %xmm1
4694 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
4695 ; SSE-NEXT: pand %xmm2, %xmm1
4696 ; SSE-NEXT: pand %xmm2, %xmm0
4697 ; SSE-NEXT: packuswb %xmm1, %xmm0
4700 ; AVX1-LABEL: trunc_or_v16i16_v16i8:
4702 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
4703 ; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
4704 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
4705 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
4706 ; AVX1-NEXT: vzeroupper
4709 ; AVX2-LABEL: trunc_or_v16i16_v16i8:
4711 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
4712 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
4713 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
4714 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
4715 ; AVX2-NEXT: vzeroupper
4718 ; AVX512F-LABEL: trunc_or_v16i16_v16i8:
4720 ; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
4721 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
4722 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
4723 ; AVX512F-NEXT: vzeroupper
4724 ; AVX512F-NEXT: retq
4726 ; AVX512BW-LABEL: trunc_or_v16i16_v16i8:
4727 ; AVX512BW: # %bb.0:
4728 ; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
4729 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
4730 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
4731 ; AVX512BW-NEXT: vzeroupper
4732 ; AVX512BW-NEXT: retq
4734 ; AVX512DQ-LABEL: trunc_or_v16i16_v16i8:
4735 ; AVX512DQ: # %bb.0:
4736 ; AVX512DQ-NEXT: vpor %ymm1, %ymm0, %ymm0
4737 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
4738 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
4739 ; AVX512DQ-NEXT: vzeroupper
4740 ; AVX512DQ-NEXT: retq
4741 %1 = or <16 x i16> %a0, %a1
4742 %2 = trunc <16 x i16> %1 to <16 x i8>
4743 ret <16 x i8> %2
4744 }
4750 define <4 x i32> @trunc_or_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
4751 ; SSE-LABEL: trunc_or_const_v4i64_v4i32:
4753 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
4754 ; SSE-NEXT: orps {{.*}}(%rip), %xmm0
4757 ; AVX1-LABEL: trunc_or_const_v4i64_v4i32:
4759 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
4760 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
4761 ; AVX1-NEXT: vorps {{.*}}(%rip), %xmm0, %xmm0
4762 ; AVX1-NEXT: vzeroupper
4765 ; AVX2-SLOW-LABEL: trunc_or_const_v4i64_v4i32:
4766 ; AVX2-SLOW: # %bb.0:
4767 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
4768 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
4769 ; AVX2-SLOW-NEXT: vorps {{.*}}(%rip), %xmm0, %xmm0
4770 ; AVX2-SLOW-NEXT: vzeroupper
4771 ; AVX2-SLOW-NEXT: retq
4773 ; AVX2-FAST-LABEL: trunc_or_const_v4i64_v4i32:
4774 ; AVX2-FAST: # %bb.0:
4775 ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
4776 ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0
4777 ; AVX2-FAST-NEXT: vorps {{.*}}(%rip), %xmm0, %xmm0
4778 ; AVX2-FAST-NEXT: vzeroupper
4779 ; AVX2-FAST-NEXT: retq
4781 ; AVX512-LABEL: trunc_or_const_v4i64_v4i32:
4783 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
4784 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
4785 ; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
4786 ; AVX512-NEXT: vzeroupper
4788 %1 = or <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
4789 %2 = trunc <4 x i64> %1 to <4 x i32>
4790 ret <4 x i32> %2
4791 }
4793 define <8 x i16> @trunc_or_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
4794 ; SSE-LABEL: trunc_or_const_v8i64_v8i16:
4796 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
4797 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
4798 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
4799 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
4800 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
4801 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
4802 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
4803 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
4804 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
4805 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
4806 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
4807 ; SSE-NEXT: orpd {{.*}}(%rip), %xmm0
4810 ; AVX1-LABEL: trunc_or_const_v8i64_v8i16:
4812 ; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
4813 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
4814 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
4815 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
4816 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
4817 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
4818 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
4819 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
4820 ; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
4821 ; AVX1-NEXT: vzeroupper
4824 ; AVX2-SLOW-LABEL: trunc_or_const_v8i64_v8i16:
4825 ; AVX2-SLOW: # %bb.0:
4826 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
4827 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
4828 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
4829 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
4830 ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
4831 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
4832 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4833 ; AVX2-SLOW-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
4834 ; AVX2-SLOW-NEXT: vzeroupper
4835 ; AVX2-SLOW-NEXT: retq
4837 ; AVX2-FAST-LABEL: trunc_or_const_v8i64_v8i16:
4838 ; AVX2-FAST: # %bb.0:
4839 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
4840 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
4841 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
4842 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
4843 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
4844 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4845 ; AVX2-FAST-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
4846 ; AVX2-FAST-NEXT: vzeroupper
4847 ; AVX2-FAST-NEXT: retq
4849 ; AVX512-LABEL: trunc_or_const_v8i64_v8i16:
4851 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0
4852 ; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
4853 ; AVX512-NEXT: vzeroupper
4855 %1 = or <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
4856 %2 = trunc <8 x i64> %1 to <8 x i16>
4857 ret <8 x i16> %2
4858 }
4860 define <8 x i16> @trunc_or_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
4861 ; SSE-LABEL: trunc_or_const_v8i32_v8i16:
4863 ; SSE-NEXT: pslld $16, %xmm1
4864 ; SSE-NEXT: psrad $16, %xmm1
4865 ; SSE-NEXT: pslld $16, %xmm0
4866 ; SSE-NEXT: psrad $16, %xmm0
4867 ; SSE-NEXT: packssdw %xmm1, %xmm0
4868 ; SSE-NEXT: por {{.*}}(%rip), %xmm0
4871 ; AVX1-LABEL: trunc_or_const_v8i32_v8i16:
4873 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
4874 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
4875 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
4876 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
4877 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4878 ; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
4879 ; AVX1-NEXT: vzeroupper
4882 ; AVX2-LABEL: trunc_or_const_v8i32_v8i16:
4884 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
4885 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4886 ; AVX2-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
4887 ; AVX2-NEXT: vzeroupper
4890 ; AVX512-LABEL: trunc_or_const_v8i32_v8i16:
4892 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
4893 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
4894 ; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
4895 ; AVX512-NEXT: vzeroupper
4897 %1 = or <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
4898 %2 = trunc <8 x i32> %1 to <8 x i16>
4899 ret <8 x i16> %2
4900 }
4902 define <16 x i8> @trunc_or_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
4903 ; SSE-LABEL: trunc_or_const_v16i64_v16i8:
4905 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
4906 ; SSE-NEXT: pand %xmm8, %xmm7
4907 ; SSE-NEXT: pand %xmm8, %xmm6
4908 ; SSE-NEXT: packuswb %xmm7, %xmm6
4909 ; SSE-NEXT: pand %xmm8, %xmm5
4910 ; SSE-NEXT: pand %xmm8, %xmm4
4911 ; SSE-NEXT: packuswb %xmm5, %xmm4
4912 ; SSE-NEXT: packuswb %xmm6, %xmm4
4913 ; SSE-NEXT: pand %xmm8, %xmm3
4914 ; SSE-NEXT: pand %xmm8, %xmm2
4915 ; SSE-NEXT: packuswb %xmm3, %xmm2
4916 ; SSE-NEXT: pand %xmm8, %xmm1
4917 ; SSE-NEXT: pand %xmm8, %xmm0
4918 ; SSE-NEXT: packuswb %xmm1, %xmm0
4919 ; SSE-NEXT: packuswb %xmm2, %xmm0
4920 ; SSE-NEXT: packuswb %xmm4, %xmm0
4921 ; SSE-NEXT: por {{.*}}(%rip), %xmm0
4924 ; AVX1-LABEL: trunc_or_const_v16i64_v16i8:
4926 ; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255]
4927 ; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
4928 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
4929 ; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
4930 ; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
4931 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
4932 ; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2
4933 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
4934 ; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
4935 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
4936 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
4937 ; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
4938 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
4939 ; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
4940 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
4941 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
4942 ; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
4943 ; AVX1-NEXT: vzeroupper
4946 ; AVX2-SLOW-LABEL: trunc_or_const_v16i64_v16i8:
4947 ; AVX2-SLOW: # %bb.0:
4948 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4
4949 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
4950 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4
4951 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
4952 ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
4953 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
4954 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
4955 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
4956 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
4957 ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
4958 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5
4959 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
4960 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5
4961 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
4962 ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
4963 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
4964 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4965 ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
4966 ; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
4967 ; AVX2-SLOW-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
4968 ; AVX2-SLOW-NEXT: vzeroupper
4969 ; AVX2-SLOW-NEXT: retq
4971 ; AVX2-FAST-LABEL: trunc_or_const_v16i64_v16i8:
4972 ; AVX2-FAST: # %bb.0:
4973 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
4974 ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2
4975 ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3
4976 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
4977 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
4978 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2
4979 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
4980 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
4981 ; AVX2-FAST-NEXT: vpand %xmm5, %xmm2, %xmm2
4982 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0
4983 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1
4984 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
4985 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0
4986 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4987 ; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0
4988 ; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
4989 ; AVX2-FAST-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
4990 ; AVX2-FAST-NEXT: vzeroupper
4991 ; AVX2-FAST-NEXT: retq
4993 ; AVX512-LABEL: trunc_or_const_v16i64_v16i8:
4995 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
4996 ; AVX512-NEXT: vpmovqd %zmm1, %ymm1
4997 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
4998 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
4999 ; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
5000 ; AVX512-NEXT: vzeroupper
5002 %1 = or <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
5003 %2 = trunc <16 x i64> %1 to <16 x i8>
5004 ret <16 x i8> %2
5005 }
5007 define <16 x i8> @trunc_or_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
5008 ; SSE-LABEL: trunc_or_const_v16i32_v16i8:
5010 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
5011 ; SSE-NEXT: pand %xmm4, %xmm3
5012 ; SSE-NEXT: pand %xmm4, %xmm2
5013 ; SSE-NEXT: packuswb %xmm3, %xmm2
5014 ; SSE-NEXT: pand %xmm4, %xmm1
5015 ; SSE-NEXT: pand %xmm4, %xmm0
5016 ; SSE-NEXT: packuswb %xmm1, %xmm0
5017 ; SSE-NEXT: packuswb %xmm2, %xmm0
5018 ; SSE-NEXT: por {{.*}}(%rip), %xmm0
5021 ; AVX1-LABEL: trunc_or_const_v16i32_v16i8:
5023 ; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
5024 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
5025 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
5026 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
5027 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
5028 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
5029 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
5030 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
5031 ; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
5032 ; AVX1-NEXT: vzeroupper
5035 ; AVX2-LABEL: trunc_or_const_v16i32_v16i8:
5037 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
5038 ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
5039 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
5040 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
5041 ; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1
5042 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
5043 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
5044 ; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0
5045 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
5046 ; AVX2-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
5047 ; AVX2-NEXT: vzeroupper
5050 ; AVX512-LABEL: trunc_or_const_v16i32_v16i8:
5052 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
5053 ; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
5054 ; AVX512-NEXT: vzeroupper
5056 %1 = or <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
5057 %2 = trunc <16 x i32> %1 to <16 x i8>
5058 ret <16 x i8> %2
5059 }
5061 define <16 x i8> @trunc_or_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
5062 ; SSE-LABEL: trunc_or_const_v16i16_v16i8:
5064 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
5065 ; SSE-NEXT: pand %xmm2, %xmm1
5066 ; SSE-NEXT: pand %xmm2, %xmm0
5067 ; SSE-NEXT: packuswb %xmm1, %xmm0
5068 ; SSE-NEXT: por {{.*}}(%rip), %xmm0
5071 ; AVX1-LABEL: trunc_or_const_v16i16_v16i8:
5073 ; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
5074 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
5075 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
5076 ; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
5077 ; AVX1-NEXT: vzeroupper
5080 ; AVX2-LABEL: trunc_or_const_v16i16_v16i8:
5082 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
5083 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
5084 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
5085 ; AVX2-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
5086 ; AVX2-NEXT: vzeroupper
5089 ; AVX512F-LABEL: trunc_or_const_v16i16_v16i8:
5091 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
5092 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
5093 ; AVX512F-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
5094 ; AVX512F-NEXT: vzeroupper
5095 ; AVX512F-NEXT: retq
5097 ; AVX512BW-LABEL: trunc_or_const_v16i16_v16i8:
5098 ; AVX512BW: # %bb.0:
5099 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
5100 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
5101 ; AVX512BW-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
5102 ; AVX512BW-NEXT: vzeroupper
5103 ; AVX512BW-NEXT: retq
5105 ; AVX512DQ-LABEL: trunc_or_const_v16i16_v16i8:
5106 ; AVX512DQ: # %bb.0:
5107 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
5108 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
5109 ; AVX512DQ-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
5110 ; AVX512DQ-NEXT: vzeroupper
5111 ; AVX512DQ-NEXT: retq
5112 %1 = or <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
5113 %2 = trunc <16 x i16> %1 to <16 x i8>
5114 ret <16 x i8> %2
5115 }
5118 ; complex patterns - often created by vectorizer
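;
; A brief hand-written note (not produced by update_llc_test_checks.py): the
; mul_add_* tests below model the kind of IR the loop vectorizer tends to emit
; for 32-bit multiply-accumulate source, e.g. (illustrative C only, not part
; of this test):
;
;   void mul_add(const int *a, const int *b, int *r, int n) {
;     for (int i = 0; i < n; ++i)
;       r[i] = a[i] * b[i] + 3;   /* operands widened to 64 bits in IR */
;   }
;
; The i32 operands are sign-extended to <4 x i64>, multiplied, combined with
; an add, and truncated back to <4 x i32>; the FileCheck assertions verify the
; backend narrows this to 32-bit vector math (vpmulld/vpaddd on AVX,
; pmuludq-based shuffles on SSE2) rather than full 64-bit multiplies.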
5121 define <4 x i32> @mul_add_const_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
5122 ; SSE-LABEL: mul_add_const_v4i64_v4i32:
5124 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
5125 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3]
5126 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
5127 ; SSE-NEXT: pmuludq %xmm2, %xmm0
5128 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
5129 ; SSE-NEXT: pmuludq %xmm3, %xmm1
5130 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
5131 ; SSE-NEXT: paddd {{.*}}(%rip), %xmm0
5134 ; AVX-LABEL: mul_add_const_v4i64_v4i32:
5136 ; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
5137 ; AVX-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
5139 %1 = sext <4 x i32> %a0 to <4 x i64>
5140 %2 = sext <4 x i32> %a1 to <4 x i64>
5141 %3 = mul <4 x i64> %1, %2
5142 %4 = add <4 x i64> %3, <i64 -3, i64 -1, i64 1, i64 3>
5143 %5 = trunc <4 x i64> %4 to <4 x i32>
5144 ret <4 x i32> %5
5145 }
5147 define <4 x i32> @mul_add_self_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
5148 ; SSE-LABEL: mul_add_self_v4i64_v4i32:
5150 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
5151 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3]
5152 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
5153 ; SSE-NEXT: pmuludq %xmm2, %xmm0
5154 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
5155 ; SSE-NEXT: pmuludq %xmm3, %xmm1
5156 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
5157 ; SSE-NEXT: paddd %xmm0, %xmm0
5160 ; AVX-LABEL: mul_add_self_v4i64_v4i32:
5162 ; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
5163 ; AVX-NEXT: vpaddd %xmm0, %xmm0, %xmm0
5165 %1 = sext <4 x i32> %a0 to <4 x i64>
5166 %2 = sext <4 x i32> %a1 to <4 x i64>
5167 %3 = mul <4 x i64> %1, %2
5168 %4 = add <4 x i64> %3, %3
5169 %5 = trunc <4 x i64> %4 to <4 x i32>
5170 ret <4 x i32> %5
5171 }
5173 define <4 x i32> @mul_add_multiuse_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
5174 ; SSE-LABEL: mul_add_multiuse_v4i64_v4i32:
5176 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
5177 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3]
5178 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,1,1,3]
5179 ; SSE-NEXT: pmuludq %xmm2, %xmm4
5180 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
5181 ; SSE-NEXT: pmuludq %xmm3, %xmm1
5182 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm1[0,2]
5183 ; SSE-NEXT: paddd %xmm4, %xmm0
5186 ; AVX-LABEL: mul_add_multiuse_v4i64_v4i32:
5188 ; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm1
5189 ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
5191 %1 = sext <4 x i32> %a0 to <4 x i64>
5192 %2 = sext <4 x i32> %a1 to <4 x i64>
5193 %3 = mul <4 x i64> %1, %2
5194 %4 = add <4 x i64> %1, %3
5195 %5 = trunc <4 x i64> %4 to <4 x i32>