1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST-ALL
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST-PERLANE
7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F
8 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW
9 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW
10 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512DQ
11 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512DQ
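; The functions below each perform a vector add or sub in a wide element type
; (i64/i32/i16 lanes, against a second vector or a constant operand) and then
; truncate the result to a narrower element type; the check lines verify the
; lowering for each of the SSE/AVX/AVX2/AVX512 configurations above.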
17 define <4 x i32> @trunc_add_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
18 ; SSE-LABEL: trunc_add_v4i64_v4i32:
20 ; SSE-NEXT: paddq %xmm3, %xmm1
21 ; SSE-NEXT: paddq %xmm2, %xmm0
22 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
25 ; AVX1-LABEL: trunc_add_v4i64_v4i32:
27 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
28 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
29 ; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2
30 ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
31 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
32 ; AVX1-NEXT: vzeroupper
35 ; AVX2-SLOW-LABEL: trunc_add_v4i64_v4i32:
37 ; AVX2-SLOW-NEXT: vpaddq %ymm1, %ymm0, %ymm0
38 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
39 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
40 ; AVX2-SLOW-NEXT: vzeroupper
41 ; AVX2-SLOW-NEXT: retq
43 ; AVX2-FAST-ALL-LABEL: trunc_add_v4i64_v4i32:
44 ; AVX2-FAST-ALL: # %bb.0:
45 ; AVX2-FAST-ALL-NEXT: vpaddq %ymm1, %ymm0, %ymm0
46 ; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
47 ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0
48 ; AVX2-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
49 ; AVX2-FAST-ALL-NEXT: vzeroupper
50 ; AVX2-FAST-ALL-NEXT: retq
52 ; AVX2-FAST-PERLANE-LABEL: trunc_add_v4i64_v4i32:
53 ; AVX2-FAST-PERLANE: # %bb.0:
54 ; AVX2-FAST-PERLANE-NEXT: vpaddq %ymm1, %ymm0, %ymm0
55 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1
56 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
57 ; AVX2-FAST-PERLANE-NEXT: vzeroupper
58 ; AVX2-FAST-PERLANE-NEXT: retq
60 ; AVX512-LABEL: trunc_add_v4i64_v4i32:
62 ; AVX512-NEXT: vpaddq %ymm1, %ymm0, %ymm0
63 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
64 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
65 ; AVX512-NEXT: vzeroupper
67 %1 = add <4 x i64> %a0, %a1
68 %2 = trunc <4 x i64> %1 to <4 x i32>
69 ret <4 x i32> %2
70 }
72 define <8 x i16> @trunc_add_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
73 ; SSE-LABEL: trunc_add_v8i64_v8i16:
75 ; SSE-NEXT: paddq %xmm6, %xmm2
76 ; SSE-NEXT: paddq %xmm7, %xmm3
77 ; SSE-NEXT: paddq %xmm4, %xmm0
78 ; SSE-NEXT: paddq %xmm5, %xmm1
79 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
80 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
81 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
82 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
83 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
84 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
85 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
86 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
87 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
88 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
89 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
92 ; AVX1-LABEL: trunc_add_v8i64_v8i16:
94 ; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm4
95 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
96 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
97 ; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0
98 ; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm2
99 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
100 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
101 ; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1
102 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
103 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
104 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
105 ; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
106 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
107 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1,2,3],xmm4[4],xmm3[5,6,7]
108 ; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
109 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
110 ; AVX1-NEXT: vzeroupper
113 ; AVX2-LABEL: trunc_add_v8i64_v8i16:
115 ; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
116 ; AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1
117 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
118 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
119 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
120 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
121 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
122 ; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
123 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
124 ; AVX2-NEXT: vzeroupper
127 ; AVX512-LABEL: trunc_add_v8i64_v8i16:
129 ; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0
130 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0
131 ; AVX512-NEXT: vzeroupper
133 %1 = add <8 x i64> %a0, %a1
134 %2 = trunc <8 x i64> %1 to <8 x i16>
135 ret <8 x i16> %2
136 }
138 define <8 x i16> @trunc_add_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
139 ; SSE-LABEL: trunc_add_v8i32_v8i16:
141 ; SSE-NEXT: paddd %xmm2, %xmm0
142 ; SSE-NEXT: paddd %xmm3, %xmm1
143 ; SSE-NEXT: pslld $16, %xmm1
144 ; SSE-NEXT: psrad $16, %xmm1
145 ; SSE-NEXT: pslld $16, %xmm0
146 ; SSE-NEXT: psrad $16, %xmm0
147 ; SSE-NEXT: packssdw %xmm1, %xmm0
150 ; AVX1-LABEL: trunc_add_v8i32_v8i16:
152 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm2
153 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
154 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
155 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
156 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
157 ; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
158 ; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1
159 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
160 ; AVX1-NEXT: vzeroupper
163 ; AVX2-LABEL: trunc_add_v8i32_v8i16:
165 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
166 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
167 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
168 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
169 ; AVX2-NEXT: vzeroupper
172 ; AVX512-LABEL: trunc_add_v8i32_v8i16:
174 ; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0
175 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
176 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
177 ; AVX512-NEXT: vzeroupper
179 %1 = add <8 x i32> %a0, %a1
180 %2 = trunc <8 x i32> %1 to <8 x i16>
181 ret <8 x i16> %2
182 }
184 define <16 x i8> @trunc_add_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
185 ; SSE-LABEL: trunc_add_v16i64_v16i8:
187 ; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm0
188 ; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm1
189 ; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm2
190 ; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm3
191 ; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm4
192 ; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm5
193 ; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm6
194 ; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm7
195 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
196 ; SSE-NEXT: pand %xmm8, %xmm7
197 ; SSE-NEXT: pand %xmm8, %xmm6
198 ; SSE-NEXT: packuswb %xmm7, %xmm6
199 ; SSE-NEXT: pand %xmm8, %xmm5
200 ; SSE-NEXT: pand %xmm8, %xmm4
201 ; SSE-NEXT: packuswb %xmm5, %xmm4
202 ; SSE-NEXT: packuswb %xmm6, %xmm4
203 ; SSE-NEXT: pand %xmm8, %xmm3
204 ; SSE-NEXT: pand %xmm8, %xmm2
205 ; SSE-NEXT: packuswb %xmm3, %xmm2
206 ; SSE-NEXT: pand %xmm8, %xmm1
207 ; SSE-NEXT: pand %xmm8, %xmm0
208 ; SSE-NEXT: packuswb %xmm1, %xmm0
209 ; SSE-NEXT: packuswb %xmm2, %xmm0
210 ; SSE-NEXT: packuswb %xmm4, %xmm0
213 ; AVX1-LABEL: trunc_add_v16i64_v16i8:
215 ; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm8
216 ; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm4
217 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
218 ; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm0
219 ; AVX1-NEXT: vpaddq %xmm5, %xmm1, %xmm4
220 ; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm5
221 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
222 ; AVX1-NEXT: vpaddq %xmm5, %xmm1, %xmm1
223 ; AVX1-NEXT: vpaddq %xmm6, %xmm2, %xmm5
224 ; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm6
225 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
226 ; AVX1-NEXT: vpaddq %xmm6, %xmm2, %xmm2
227 ; AVX1-NEXT: vpaddq %xmm7, %xmm3, %xmm6
228 ; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm7
229 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
230 ; AVX1-NEXT: vpaddq %xmm7, %xmm3, %xmm3
231 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [255,255]
232 ; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
233 ; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6
234 ; AVX1-NEXT: vpackusdw %xmm3, %xmm6, %xmm3
235 ; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2
236 ; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5
237 ; AVX1-NEXT: vpackusdw %xmm2, %xmm5, %xmm2
238 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
239 ; AVX1-NEXT: vpand %xmm7, %xmm1, %xmm1
240 ; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm3
241 ; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1
242 ; AVX1-NEXT: vpand %xmm7, %xmm0, %xmm0
243 ; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm3
244 ; AVX1-NEXT: vpackusdw %xmm0, %xmm3, %xmm0
245 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
246 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
247 ; AVX1-NEXT: vzeroupper
250 ; AVX2-LABEL: trunc_add_v16i64_v16i8:
252 ; AVX2-NEXT: vpaddq %ymm4, %ymm0, %ymm0
253 ; AVX2-NEXT: vpaddq %ymm5, %ymm1, %ymm1
254 ; AVX2-NEXT: vpaddq %ymm6, %ymm2, %ymm2
255 ; AVX2-NEXT: vpaddq %ymm7, %ymm3, %ymm3
256 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255]
257 ; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
258 ; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
259 ; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2
260 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
261 ; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
262 ; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
263 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
264 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
265 ; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0
266 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
267 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
268 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
269 ; AVX2-NEXT: vzeroupper
272 ; AVX512-LABEL: trunc_add_v16i64_v16i8:
274 ; AVX512-NEXT: vpaddq %zmm2, %zmm0, %zmm0
275 ; AVX512-NEXT: vpaddq %zmm3, %zmm1, %zmm1
276 ; AVX512-NEXT: vpmovqb %zmm1, %xmm1
277 ; AVX512-NEXT: vpmovqb %zmm0, %xmm0
278 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
279 ; AVX512-NEXT: vzeroupper
281 %1 = add <16 x i64> %a0, %a1
282 %2 = trunc <16 x i64> %1 to <16 x i8>
283 ret <16 x i8> %2
284 }
286 define <16 x i8> @trunc_add_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
287 ; SSE-LABEL: trunc_add_v16i32_v16i8:
289 ; SSE-NEXT: paddd %xmm4, %xmm0
290 ; SSE-NEXT: paddd %xmm5, %xmm1
291 ; SSE-NEXT: paddd %xmm6, %xmm2
292 ; SSE-NEXT: paddd %xmm7, %xmm3
293 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
294 ; SSE-NEXT: pand %xmm4, %xmm3
295 ; SSE-NEXT: pand %xmm4, %xmm2
296 ; SSE-NEXT: packuswb %xmm3, %xmm2
297 ; SSE-NEXT: pand %xmm4, %xmm1
298 ; SSE-NEXT: pand %xmm4, %xmm0
299 ; SSE-NEXT: packuswb %xmm1, %xmm0
300 ; SSE-NEXT: packuswb %xmm2, %xmm0
303 ; AVX1-LABEL: trunc_add_v16i32_v16i8:
305 ; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm4
306 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
307 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
308 ; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
309 ; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm2
310 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
311 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
312 ; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1
313 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255]
314 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
315 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
316 ; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
317 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
318 ; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm2
319 ; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
320 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
321 ; AVX1-NEXT: vzeroupper
324 ; AVX2-LABEL: trunc_add_v16i32_v16i8:
326 ; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0
327 ; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1
328 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
329 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
330 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
331 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
332 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
333 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
334 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
335 ; AVX2-NEXT: vzeroupper
338 ; AVX512-LABEL: trunc_add_v16i32_v16i8:
340 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
341 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
342 ; AVX512-NEXT: vzeroupper
344 %1 = add <16 x i32> %a0, %a1
345 %2 = trunc <16 x i32> %1 to <16 x i8>
346 ret <16 x i8> %2
347 }
349 define <16 x i8> @trunc_add_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
350 ; SSE-LABEL: trunc_add_v16i16_v16i8:
352 ; SSE-NEXT: paddw %xmm2, %xmm0
353 ; SSE-NEXT: paddw %xmm3, %xmm1
354 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
355 ; SSE-NEXT: pand %xmm2, %xmm1
356 ; SSE-NEXT: pand %xmm2, %xmm0
357 ; SSE-NEXT: packuswb %xmm1, %xmm0
360 ; AVX1-LABEL: trunc_add_v16i16_v16i8:
362 ; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm2
363 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
364 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
365 ; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
366 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
367 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
368 ; AVX1-NEXT: vpand %xmm1, %xmm2, %xmm1
369 ; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0
370 ; AVX1-NEXT: vzeroupper
373 ; AVX2-LABEL: trunc_add_v16i16_v16i8:
375 ; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
376 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
377 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
378 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
379 ; AVX2-NEXT: vzeroupper
382 ; AVX512F-LABEL: trunc_add_v16i16_v16i8:
384 ; AVX512F-NEXT: vpaddw %ymm1, %ymm0, %ymm0
385 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
386 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
387 ; AVX512F-NEXT: vzeroupper
390 ; AVX512BW-LABEL: trunc_add_v16i16_v16i8:
392 ; AVX512BW-NEXT: vpaddw %ymm1, %ymm0, %ymm0
393 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
394 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
395 ; AVX512BW-NEXT: vzeroupper
396 ; AVX512BW-NEXT: retq
398 ; AVX512DQ-LABEL: trunc_add_v16i16_v16i8:
400 ; AVX512DQ-NEXT: vpaddw %ymm1, %ymm0, %ymm0
401 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
402 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
403 ; AVX512DQ-NEXT: vzeroupper
404 ; AVX512DQ-NEXT: retq
405 %1 = add <16 x i16> %a0, %a1
406 %2 = trunc <16 x i16> %1 to <16 x i8>
407 ret <16 x i8> %2
408 }
410 define <8 x i16> @trunc_add_v8i32_v8i16_sext_8i8(<16 x i8> %a0, <8 x i32> %a1) {
411 ; SSE-LABEL: trunc_add_v8i32_v8i16_sext_8i8:
413 ; SSE-NEXT: pslld $16, %xmm2
414 ; SSE-NEXT: psrad $16, %xmm2
415 ; SSE-NEXT: pslld $16, %xmm1
416 ; SSE-NEXT: psrad $16, %xmm1
417 ; SSE-NEXT: packssdw %xmm2, %xmm1
418 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
419 ; SSE-NEXT: psraw $8, %xmm0
420 ; SSE-NEXT: paddw %xmm1, %xmm0
423 ; AVX1-LABEL: trunc_add_v8i32_v8i16_sext_8i8:
425 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
426 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
427 ; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
428 ; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
429 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
430 ; AVX1-NEXT: vpmovsxbw %xmm0, %xmm0
431 ; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
432 ; AVX1-NEXT: vzeroupper
435 ; AVX2-LABEL: trunc_add_v8i32_v8i16_sext_8i8:
437 ; AVX2-NEXT: vpmovsxbw %xmm0, %xmm0
438 ; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
439 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
440 ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
441 ; AVX2-NEXT: vzeroupper
444 ; AVX512-LABEL: trunc_add_v8i32_v8i16_sext_8i8:
446 ; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
447 ; AVX512-NEXT: vpmovdw %zmm1, %ymm1
448 ; AVX512-NEXT: vpmovsxbw %xmm0, %xmm0
449 ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
450 ; AVX512-NEXT: vzeroupper
452 %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
453 %2 = sext <8 x i8> %1 to <8 x i32>
454 %3 = add <8 x i32> %2, %a1
455 %4 = trunc <8 x i32> %3 to <8 x i16>
456 ret <8 x i16> %4
457 }
463 define <4 x i32> @trunc_add_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
464 ; SSE-LABEL: trunc_add_const_v4i64_v4i32:
466 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
467 ; SSE-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
470 ; AVX1-LABEL: trunc_add_const_v4i64_v4i32:
472 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
473 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
474 ; AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
475 ; AVX1-NEXT: vzeroupper
478 ; AVX2-SLOW-LABEL: trunc_add_const_v4i64_v4i32:
479 ; AVX2-SLOW: # %bb.0:
480 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
481 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
482 ; AVX2-SLOW-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
483 ; AVX2-SLOW-NEXT: vzeroupper
484 ; AVX2-SLOW-NEXT: retq
486 ; AVX2-FAST-ALL-LABEL: trunc_add_const_v4i64_v4i32:
487 ; AVX2-FAST-ALL: # %bb.0:
488 ; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
489 ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0
490 ; AVX2-FAST-ALL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
491 ; AVX2-FAST-ALL-NEXT: vzeroupper
492 ; AVX2-FAST-ALL-NEXT: retq
494 ; AVX2-FAST-PERLANE-LABEL: trunc_add_const_v4i64_v4i32:
495 ; AVX2-FAST-PERLANE: # %bb.0:
496 ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1
497 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
498 ; AVX2-FAST-PERLANE-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
499 ; AVX2-FAST-PERLANE-NEXT: vzeroupper
500 ; AVX2-FAST-PERLANE-NEXT: retq
502 ; AVX512-LABEL: trunc_add_const_v4i64_v4i32:
504 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
505 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
506 ; AVX512-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
507 ; AVX512-NEXT: vzeroupper
509 %1 = add <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
510 %2 = trunc <4 x i64> %1 to <4 x i32>
511 ret <4 x i32> %2
512 }
514 define <8 x i16> @trunc_add_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
515 ; SSE-LABEL: trunc_add_const_v8i64_v8i16:
517 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
518 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
519 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
520 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
521 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
522 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
523 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
524 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
525 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
526 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
527 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
528 ; SSE-NEXT: paddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
531 ; AVX1-LABEL: trunc_add_const_v8i64_v8i16:
533 ; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
534 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
535 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
536 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
537 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
538 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
539 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
540 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
541 ; AVX1-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
542 ; AVX1-NEXT: vzeroupper
545 ; AVX2-LABEL: trunc_add_const_v8i64_v8i16:
547 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
548 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
549 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
550 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
551 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
552 ; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
553 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
554 ; AVX2-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
555 ; AVX2-NEXT: vzeroupper
558 ; AVX512-LABEL: trunc_add_const_v8i64_v8i16:
560 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0
561 ; AVX512-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
562 ; AVX512-NEXT: vzeroupper
564 %1 = add <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
565 %2 = trunc <8 x i64> %1 to <8 x i16>
566 ret <8 x i16> %2
567 }
569 define <8 x i16> @trunc_add_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
570 ; SSE-LABEL: trunc_add_const_v8i32_v8i16:
572 ; SSE-NEXT: pslld $16, %xmm1
573 ; SSE-NEXT: psrad $16, %xmm1
574 ; SSE-NEXT: pslld $16, %xmm0
575 ; SSE-NEXT: psrad $16, %xmm0
576 ; SSE-NEXT: packssdw %xmm1, %xmm0
577 ; SSE-NEXT: paddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
580 ; AVX1-LABEL: trunc_add_const_v8i32_v8i16:
582 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
583 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
584 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
585 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
586 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
587 ; AVX1-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
588 ; AVX1-NEXT: vzeroupper
591 ; AVX2-LABEL: trunc_add_const_v8i32_v8i16:
593 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
594 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
595 ; AVX2-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
596 ; AVX2-NEXT: vzeroupper
599 ; AVX512-LABEL: trunc_add_const_v8i32_v8i16:
601 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
602 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
603 ; AVX512-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
604 ; AVX512-NEXT: vzeroupper
606 %1 = add <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
607 %2 = trunc <8 x i32> %1 to <8 x i16>
608 ret <8 x i16> %2
609 }
611 define <16 x i8> @trunc_add_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
612 ; SSE-LABEL: trunc_add_const_v16i64_v16i8:
614 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
615 ; SSE-NEXT: pand %xmm8, %xmm7
616 ; SSE-NEXT: pand %xmm8, %xmm6
617 ; SSE-NEXT: packuswb %xmm7, %xmm6
618 ; SSE-NEXT: pand %xmm8, %xmm5
619 ; SSE-NEXT: pand %xmm8, %xmm4
620 ; SSE-NEXT: packuswb %xmm5, %xmm4
621 ; SSE-NEXT: packuswb %xmm6, %xmm4
622 ; SSE-NEXT: pand %xmm8, %xmm3
623 ; SSE-NEXT: pand %xmm8, %xmm2
624 ; SSE-NEXT: packuswb %xmm3, %xmm2
625 ; SSE-NEXT: pand %xmm8, %xmm1
626 ; SSE-NEXT: pand %xmm8, %xmm0
627 ; SSE-NEXT: packuswb %xmm1, %xmm0
628 ; SSE-NEXT: packuswb %xmm2, %xmm0
629 ; SSE-NEXT: packuswb %xmm4, %xmm0
630 ; SSE-NEXT: paddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
633 ; AVX1-LABEL: trunc_add_const_v16i64_v16i8:
635 ; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255]
636 ; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
637 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
638 ; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
639 ; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
640 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
641 ; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2
642 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
643 ; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
644 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
645 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
646 ; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
647 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
648 ; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
649 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
650 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
651 ; AVX1-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
652 ; AVX1-NEXT: vzeroupper
655 ; AVX2-LABEL: trunc_add_const_v16i64_v16i8:
657 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255]
658 ; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
659 ; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
660 ; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2
661 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
662 ; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
663 ; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
664 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
665 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
666 ; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0
667 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
668 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
669 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
670 ; AVX2-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
671 ; AVX2-NEXT: vzeroupper
674 ; AVX512-LABEL: trunc_add_const_v16i64_v16i8:
676 ; AVX512-NEXT: vpmovqb %zmm1, %xmm1
677 ; AVX512-NEXT: vpmovqb %zmm0, %xmm0
678 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
679 ; AVX512-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
680 ; AVX512-NEXT: vzeroupper
682 %1 = add <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
683 %2 = trunc <16 x i64> %1 to <16 x i8>
684 ret <16 x i8> %2
685 }
687 define <16 x i8> @trunc_add_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
688 ; SSE-LABEL: trunc_add_const_v16i32_v16i8:
690 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
691 ; SSE-NEXT: pand %xmm4, %xmm3
692 ; SSE-NEXT: pand %xmm4, %xmm2
693 ; SSE-NEXT: packuswb %xmm3, %xmm2
694 ; SSE-NEXT: pand %xmm4, %xmm1
695 ; SSE-NEXT: pand %xmm4, %xmm0
696 ; SSE-NEXT: packuswb %xmm1, %xmm0
697 ; SSE-NEXT: packuswb %xmm2, %xmm0
698 ; SSE-NEXT: paddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
701 ; AVX1-LABEL: trunc_add_const_v16i32_v16i8:
703 ; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
704 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
705 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
706 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
707 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
708 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
709 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
710 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
711 ; AVX1-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
712 ; AVX1-NEXT: vzeroupper
715 ; AVX2-LABEL: trunc_add_const_v16i32_v16i8:
717 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
718 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
719 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
720 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
721 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
722 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
723 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
724 ; AVX2-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
725 ; AVX2-NEXT: vzeroupper
728 ; AVX512-LABEL: trunc_add_const_v16i32_v16i8:
730 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
731 ; AVX512-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
732 ; AVX512-NEXT: vzeroupper
734 %1 = add <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
735 %2 = trunc <16 x i32> %1 to <16 x i8>
736 ret <16 x i8> %2
737 }
739 define <16 x i8> @trunc_add_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
740 ; SSE-LABEL: trunc_add_const_v16i16_v16i8:
742 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
743 ; SSE-NEXT: pand %xmm2, %xmm1
744 ; SSE-NEXT: pand %xmm2, %xmm0
745 ; SSE-NEXT: packuswb %xmm1, %xmm0
746 ; SSE-NEXT: paddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
749 ; AVX1-LABEL: trunc_add_const_v16i16_v16i8:
751 ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
752 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
753 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
754 ; AVX1-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
755 ; AVX1-NEXT: vzeroupper
758 ; AVX2-LABEL: trunc_add_const_v16i16_v16i8:
760 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
761 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
762 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
763 ; AVX2-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
764 ; AVX2-NEXT: vzeroupper
767 ; AVX512F-LABEL: trunc_add_const_v16i16_v16i8:
769 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
770 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
771 ; AVX512F-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
772 ; AVX512F-NEXT: vzeroupper
775 ; AVX512BW-LABEL: trunc_add_const_v16i16_v16i8:
777 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
778 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
779 ; AVX512BW-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
780 ; AVX512BW-NEXT: vzeroupper
781 ; AVX512BW-NEXT: retq
783 ; AVX512DQ-LABEL: trunc_add_const_v16i16_v16i8:
785 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
786 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
787 ; AVX512DQ-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
788 ; AVX512DQ-NEXT: vzeroupper
789 ; AVX512DQ-NEXT: retq
790 %1 = add <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
791 %2 = trunc <16 x i16> %1 to <16 x i8>
792 ret <16 x i8> %2
793 }
799 define <4 x i32> @trunc_sub_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
800 ; SSE-LABEL: trunc_sub_v4i64_v4i32:
802 ; SSE-NEXT: psubq %xmm3, %xmm1
803 ; SSE-NEXT: psubq %xmm2, %xmm0
804 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
807 ; AVX1-LABEL: trunc_sub_v4i64_v4i32:
809 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
810 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
811 ; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2
812 ; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0
813 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
814 ; AVX1-NEXT: vzeroupper
817 ; AVX2-SLOW-LABEL: trunc_sub_v4i64_v4i32:
818 ; AVX2-SLOW: # %bb.0:
819 ; AVX2-SLOW-NEXT: vpsubq %ymm1, %ymm0, %ymm0
820 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
821 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
822 ; AVX2-SLOW-NEXT: vzeroupper
823 ; AVX2-SLOW-NEXT: retq
825 ; AVX2-FAST-ALL-LABEL: trunc_sub_v4i64_v4i32:
826 ; AVX2-FAST-ALL: # %bb.0:
827 ; AVX2-FAST-ALL-NEXT: vpsubq %ymm1, %ymm0, %ymm0
828 ; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
829 ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0
830 ; AVX2-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
831 ; AVX2-FAST-ALL-NEXT: vzeroupper
832 ; AVX2-FAST-ALL-NEXT: retq
834 ; AVX2-FAST-PERLANE-LABEL: trunc_sub_v4i64_v4i32:
835 ; AVX2-FAST-PERLANE: # %bb.0:
836 ; AVX2-FAST-PERLANE-NEXT: vpsubq %ymm1, %ymm0, %ymm0
837 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1
838 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
839 ; AVX2-FAST-PERLANE-NEXT: vzeroupper
840 ; AVX2-FAST-PERLANE-NEXT: retq
842 ; AVX512-LABEL: trunc_sub_v4i64_v4i32:
844 ; AVX512-NEXT: vpsubq %ymm1, %ymm0, %ymm0
845 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
846 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
847 ; AVX512-NEXT: vzeroupper
849 %1 = sub <4 x i64> %a0, %a1
850 %2 = trunc <4 x i64> %1 to <4 x i32>
851 ret <4 x i32> %2
852 }
854 define <8 x i16> @trunc_sub_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
855 ; SSE-LABEL: trunc_sub_v8i64_v8i16:
857 ; SSE-NEXT: psubq %xmm6, %xmm2
858 ; SSE-NEXT: psubq %xmm7, %xmm3
859 ; SSE-NEXT: psubq %xmm4, %xmm0
860 ; SSE-NEXT: psubq %xmm5, %xmm1
861 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
862 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
863 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
864 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
865 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
866 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
867 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
868 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
869 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
870 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
871 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
874 ; AVX1-LABEL: trunc_sub_v8i64_v8i16:
876 ; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm4
877 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
878 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
879 ; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0
880 ; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm2
881 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
882 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
883 ; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm1
884 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
885 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
886 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
887 ; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
888 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
889 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1,2,3],xmm4[4],xmm3[5,6,7]
890 ; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
891 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
892 ; AVX1-NEXT: vzeroupper
895 ; AVX2-LABEL: trunc_sub_v8i64_v8i16:
897 ; AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0
898 ; AVX2-NEXT: vpsubq %ymm3, %ymm1, %ymm1
899 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
900 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
901 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
902 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
903 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
904 ; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
905 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
906 ; AVX2-NEXT: vzeroupper
909 ; AVX512-LABEL: trunc_sub_v8i64_v8i16:
911 ; AVX512-NEXT: vpsubq %zmm1, %zmm0, %zmm0
912 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0
913 ; AVX512-NEXT: vzeroupper
915 %1 = sub <8 x i64> %a0, %a1
916 %2 = trunc <8 x i64> %1 to <8 x i16>
917 ret <8 x i16> %2
918 }
920 define <8 x i16> @trunc_sub_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
921 ; SSE-LABEL: trunc_sub_v8i32_v8i16:
923 ; SSE-NEXT: psubd %xmm2, %xmm0
924 ; SSE-NEXT: psubd %xmm3, %xmm1
925 ; SSE-NEXT: pslld $16, %xmm1
926 ; SSE-NEXT: psrad $16, %xmm1
927 ; SSE-NEXT: pslld $16, %xmm0
928 ; SSE-NEXT: psrad $16, %xmm0
929 ; SSE-NEXT: packssdw %xmm1, %xmm0
932 ; AVX1-LABEL: trunc_sub_v8i32_v8i16:
934 ; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm2
935 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
936 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
937 ; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
938 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
939 ; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
940 ; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1
941 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
942 ; AVX1-NEXT: vzeroupper
945 ; AVX2-LABEL: trunc_sub_v8i32_v8i16:
947 ; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0
948 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
949 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
950 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
951 ; AVX2-NEXT: vzeroupper
954 ; AVX512-LABEL: trunc_sub_v8i32_v8i16:
956 ; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm0
957 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
958 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
959 ; AVX512-NEXT: vzeroupper
961 %1 = sub <8 x i32> %a0, %a1
962 %2 = trunc <8 x i32> %1 to <8 x i16>
963 ret <8 x i16> %2
964 }
966 define <16 x i8> @trunc_sub_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
967 ; SSE-LABEL: trunc_sub_v16i64_v16i8:
969 ; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm0
970 ; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm1
971 ; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm2
972 ; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm3
973 ; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm4
974 ; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm5
975 ; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm6
976 ; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm7
977 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
978 ; SSE-NEXT: pand %xmm8, %xmm7
979 ; SSE-NEXT: pand %xmm8, %xmm6
980 ; SSE-NEXT: packuswb %xmm7, %xmm6
981 ; SSE-NEXT: pand %xmm8, %xmm5
982 ; SSE-NEXT: pand %xmm8, %xmm4
983 ; SSE-NEXT: packuswb %xmm5, %xmm4
984 ; SSE-NEXT: packuswb %xmm6, %xmm4
985 ; SSE-NEXT: pand %xmm8, %xmm3
986 ; SSE-NEXT: pand %xmm8, %xmm2
987 ; SSE-NEXT: packuswb %xmm3, %xmm2
988 ; SSE-NEXT: pand %xmm8, %xmm1
989 ; SSE-NEXT: pand %xmm8, %xmm0
990 ; SSE-NEXT: packuswb %xmm1, %xmm0
991 ; SSE-NEXT: packuswb %xmm2, %xmm0
992 ; SSE-NEXT: packuswb %xmm4, %xmm0
995 ; AVX1-LABEL: trunc_sub_v16i64_v16i8:
997 ; AVX1-NEXT: vpsubq %xmm4, %xmm0, %xmm8
998 ; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm4
999 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1000 ; AVX1-NEXT: vpsubq %xmm4, %xmm0, %xmm0
1001 ; AVX1-NEXT: vpsubq %xmm5, %xmm1, %xmm4
1002 ; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm5
1003 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
1004 ; AVX1-NEXT: vpsubq %xmm5, %xmm1, %xmm1
1005 ; AVX1-NEXT: vpsubq %xmm6, %xmm2, %xmm5
1006 ; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm6
1007 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
1008 ; AVX1-NEXT: vpsubq %xmm6, %xmm2, %xmm2
1009 ; AVX1-NEXT: vpsubq %xmm7, %xmm3, %xmm6
1010 ; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm7
1011 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
1012 ; AVX1-NEXT: vpsubq %xmm7, %xmm3, %xmm3
1013 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [255,255]
1014 ; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
1015 ; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6
1016 ; AVX1-NEXT: vpackusdw %xmm3, %xmm6, %xmm3
1017 ; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2
1018 ; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5
1019 ; AVX1-NEXT: vpackusdw %xmm2, %xmm5, %xmm2
1020 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
1021 ; AVX1-NEXT: vpand %xmm7, %xmm1, %xmm1
1022 ; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm3
1023 ; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1
1024 ; AVX1-NEXT: vpand %xmm7, %xmm0, %xmm0
1025 ; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm3
1026 ; AVX1-NEXT: vpackusdw %xmm0, %xmm3, %xmm0
1027 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
1028 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
1029 ; AVX1-NEXT: vzeroupper
1032 ; AVX2-LABEL: trunc_sub_v16i64_v16i8:
1034 ; AVX2-NEXT: vpsubq %ymm4, %ymm0, %ymm0
1035 ; AVX2-NEXT: vpsubq %ymm5, %ymm1, %ymm1
1036 ; AVX2-NEXT: vpsubq %ymm6, %ymm2, %ymm2
1037 ; AVX2-NEXT: vpsubq %ymm7, %ymm3, %ymm3
1038 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255]
1039 ; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
1040 ; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
1041 ; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2
1042 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
1043 ; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
1044 ; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
1045 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
1046 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
1047 ; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0
1048 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1049 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
1050 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
1051 ; AVX2-NEXT: vzeroupper
1054 ; AVX512-LABEL: trunc_sub_v16i64_v16i8:
1056 ; AVX512-NEXT: vpsubq %zmm2, %zmm0, %zmm0
1057 ; AVX512-NEXT: vpsubq %zmm3, %zmm1, %zmm1
1058 ; AVX512-NEXT: vpmovqb %zmm1, %xmm1
1059 ; AVX512-NEXT: vpmovqb %zmm0, %xmm0
1060 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1061 ; AVX512-NEXT: vzeroupper
1063 %1 = sub <16 x i64> %a0, %a1
1064 %2 = trunc <16 x i64> %1 to <16 x i8>
1065 ret <16 x i8> %2
1066 }
1068 define <16 x i8> @trunc_sub_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
1069 ; SSE-LABEL: trunc_sub_v16i32_v16i8:
1071 ; SSE-NEXT: psubd %xmm4, %xmm0
1072 ; SSE-NEXT: psubd %xmm5, %xmm1
1073 ; SSE-NEXT: psubd %xmm6, %xmm2
1074 ; SSE-NEXT: psubd %xmm7, %xmm3
1075 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
1076 ; SSE-NEXT: pand %xmm4, %xmm3
1077 ; SSE-NEXT: pand %xmm4, %xmm2
1078 ; SSE-NEXT: packuswb %xmm3, %xmm2
1079 ; SSE-NEXT: pand %xmm4, %xmm1
1080 ; SSE-NEXT: pand %xmm4, %xmm0
1081 ; SSE-NEXT: packuswb %xmm1, %xmm0
1082 ; SSE-NEXT: packuswb %xmm2, %xmm0
1085 ; AVX1-LABEL: trunc_sub_v16i32_v16i8:
1087 ; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm4
1088 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
1089 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1090 ; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm0
1091 ; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm2
1092 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
1093 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
1094 ; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm1
1095 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255]
1096 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
1097 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
1098 ; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
1099 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
1100 ; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm2
1101 ; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
1102 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
1103 ; AVX1-NEXT: vzeroupper
1106 ; AVX2-LABEL: trunc_sub_v16i32_v16i8:
1108 ; AVX2-NEXT: vpsubd %ymm2, %ymm0, %ymm0
1109 ; AVX2-NEXT: vpsubd %ymm3, %ymm1, %ymm1
1110 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
1111 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
1112 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
1113 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
1114 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1115 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
1116 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
1117 ; AVX2-NEXT: vzeroupper
1120 ; AVX512-LABEL: trunc_sub_v16i32_v16i8:
1122 ; AVX512-NEXT: vpsubd %zmm1, %zmm0, %zmm0
1123 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
1124 ; AVX512-NEXT: vzeroupper
1126 %1 = sub <16 x i32> %a0, %a1
1127 %2 = trunc <16 x i32> %1 to <16 x i8>
1128 ret <16 x i8> %2
1129 }
1131 define <16 x i8> @trunc_sub_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
1132 ; SSE-LABEL: trunc_sub_v16i16_v16i8:
1134 ; SSE-NEXT: psubw %xmm2, %xmm0
1135 ; SSE-NEXT: psubw %xmm3, %xmm1
1136 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
1137 ; SSE-NEXT: pand %xmm2, %xmm1
1138 ; SSE-NEXT: pand %xmm2, %xmm0
1139 ; SSE-NEXT: packuswb %xmm1, %xmm0
1142 ; AVX1-LABEL: trunc_sub_v16i16_v16i8:
1144 ; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm2
1145 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
1146 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1147 ; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm0
1148 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
1149 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
1150 ; AVX1-NEXT: vpand %xmm1, %xmm2, %xmm1
1151 ; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0
1152 ; AVX1-NEXT: vzeroupper
1155 ; AVX2-LABEL: trunc_sub_v16i16_v16i8:
1157 ; AVX2-NEXT: vpsubw %ymm1, %ymm0, %ymm0
1158 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1159 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1160 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
1161 ; AVX2-NEXT: vzeroupper
1164 ; AVX512F-LABEL: trunc_sub_v16i16_v16i8:
1166 ; AVX512F-NEXT: vpsubw %ymm1, %ymm0, %ymm0
1167 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1168 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
1169 ; AVX512F-NEXT: vzeroupper
1170 ; AVX512F-NEXT: retq
1172 ; AVX512BW-LABEL: trunc_sub_v16i16_v16i8:
1173 ; AVX512BW: # %bb.0:
1174 ; AVX512BW-NEXT: vpsubw %ymm1, %ymm0, %ymm0
1175 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
1176 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1177 ; AVX512BW-NEXT: vzeroupper
1178 ; AVX512BW-NEXT: retq
1180 ; AVX512DQ-LABEL: trunc_sub_v16i16_v16i8:
1181 ; AVX512DQ: # %bb.0:
1182 ; AVX512DQ-NEXT: vpsubw %ymm1, %ymm0, %ymm0
1183 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1184 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
1185 ; AVX512DQ-NEXT: vzeroupper
1186 ; AVX512DQ-NEXT: retq
1187 %1 = sub <16 x i16> %a0, %a1
1188 %2 = trunc <16 x i16> %1 to <16 x i8>
1189 ret <16 x i8> %2
1190 }
1192 define <16 x i8> @trunc_ext_sub_v16i16_v16i8(<16 x i8> %x, <16 x i8> %y) {
1193 ; SSE-LABEL: trunc_ext_sub_v16i16_v16i8:
1195 ; SSE-NEXT: psubb %xmm1, %xmm0
1198 ; AVX-LABEL: trunc_ext_sub_v16i16_v16i8:
1200 ; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0
1202 %a = zext <16 x i8> %x to <16 x i16>
1203 %b = zext <16 x i8> %y to <16 x i16>
1204 %c = sub <16 x i16> %a, %b
1205 %d = trunc <16 x i16> %c to <16 x i8>
1206 ret <16 x i8> %d
1207 }
1213 define <4 x i32> @trunc_sub_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
1214 ; SSE-LABEL: trunc_sub_const_v4i64_v4i32:
1216 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
1217 ; SSE-NEXT: psubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1220 ; AVX1-LABEL: trunc_sub_const_v4i64_v4i32:
1222 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1223 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
1224 ; AVX1-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1225 ; AVX1-NEXT: vzeroupper
1228 ; AVX2-SLOW-LABEL: trunc_sub_const_v4i64_v4i32:
1229 ; AVX2-SLOW: # %bb.0:
1230 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
1231 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
1232 ; AVX2-SLOW-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1233 ; AVX2-SLOW-NEXT: vzeroupper
1234 ; AVX2-SLOW-NEXT: retq
1236 ; AVX2-FAST-ALL-LABEL: trunc_sub_const_v4i64_v4i32:
1237 ; AVX2-FAST-ALL: # %bb.0:
1238 ; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
1239 ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0
1240 ; AVX2-FAST-ALL-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1241 ; AVX2-FAST-ALL-NEXT: vzeroupper
1242 ; AVX2-FAST-ALL-NEXT: retq
1244 ; AVX2-FAST-PERLANE-LABEL: trunc_sub_const_v4i64_v4i32:
1245 ; AVX2-FAST-PERLANE: # %bb.0:
1246 ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1
1247 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
1248 ; AVX2-FAST-PERLANE-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1249 ; AVX2-FAST-PERLANE-NEXT: vzeroupper
1250 ; AVX2-FAST-PERLANE-NEXT: retq
1252 ; AVX512-LABEL: trunc_sub_const_v4i64_v4i32:
1254 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1255 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
1256 ; AVX512-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1257 ; AVX512-NEXT: vzeroupper
1259 %1 = sub <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
1260 %2 = trunc <4 x i64> %1 to <4 x i32>
1261 ret <4 x i32> %2
1262 }
1264 define <8 x i16> @trunc_sub_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
1265 ; SSE-LABEL: trunc_sub_const_v8i64_v8i16:
1267 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1268 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
1269 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1270 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
1271 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
1272 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
1273 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
1274 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
1275 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
1276 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1277 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
1278 ; SSE-NEXT: psubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1281 ; AVX1-LABEL: trunc_sub_const_v8i64_v8i16:
1283 ; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
1284 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
1285 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
1286 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
1287 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
1288 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
1289 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
1290 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
1291 ; AVX1-NEXT: vpsubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1292 ; AVX1-NEXT: vzeroupper
1295 ; AVX2-LABEL: trunc_sub_const_v8i64_v8i16:
1297 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
1298 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
1299 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
1300 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
1301 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1302 ; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
1303 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
1304 ; AVX2-NEXT: vpsubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1305 ; AVX2-NEXT: vzeroupper
1308 ; AVX512-LABEL: trunc_sub_const_v8i64_v8i16:
1310 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0
1311 ; AVX512-NEXT: vpsubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1312 ; AVX512-NEXT: vzeroupper
1314 %1 = sub <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
1315 %2 = trunc <8 x i64> %1 to <8 x i16>
1316 ret <8 x i16> %2
1317 }
1319 define <8 x i16> @trunc_sub_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
1320 ; SSE-LABEL: trunc_sub_const_v8i32_v8i16:
1322 ; SSE-NEXT: pslld $16, %xmm1
1323 ; SSE-NEXT: psrad $16, %xmm1
1324 ; SSE-NEXT: pslld $16, %xmm0
1325 ; SSE-NEXT: psrad $16, %xmm0
1326 ; SSE-NEXT: packssdw %xmm1, %xmm0
1327 ; SSE-NEXT: psubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1330 ; AVX1-LABEL: trunc_sub_const_v8i32_v8i16:
1332 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1333 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
1334 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
1335 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
1336 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1337 ; AVX1-NEXT: vpsubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1338 ; AVX1-NEXT: vzeroupper
1341 ; AVX2-LABEL: trunc_sub_const_v8i32_v8i16:
1343 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
1344 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1345 ; AVX2-NEXT: vpsubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1346 ; AVX2-NEXT: vzeroupper
1349 ; AVX512-LABEL: trunc_sub_const_v8i32_v8i16:
1351 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1352 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
1353 ; AVX512-NEXT: vpsubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1354 ; AVX512-NEXT: vzeroupper
1356 %1 = sub <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1357 %2 = trunc <8 x i32> %1 to <8 x i16>
1358 ret <8 x i16> %2
1359 }
1361 define <16 x i8> @trunc_sub_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
1362 ; SSE-LABEL: trunc_sub_const_v16i64_v16i8:
1364 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
1365 ; SSE-NEXT: pand %xmm8, %xmm7
1366 ; SSE-NEXT: pand %xmm8, %xmm6
1367 ; SSE-NEXT: packuswb %xmm7, %xmm6
1368 ; SSE-NEXT: pand %xmm8, %xmm5
1369 ; SSE-NEXT: pand %xmm8, %xmm4
1370 ; SSE-NEXT: packuswb %xmm5, %xmm4
1371 ; SSE-NEXT: packuswb %xmm6, %xmm4
1372 ; SSE-NEXT: pand %xmm8, %xmm3
1373 ; SSE-NEXT: pand %xmm8, %xmm2
1374 ; SSE-NEXT: packuswb %xmm3, %xmm2
1375 ; SSE-NEXT: pand %xmm8, %xmm1
1376 ; SSE-NEXT: pand %xmm8, %xmm0
1377 ; SSE-NEXT: packuswb %xmm1, %xmm0
1378 ; SSE-NEXT: packuswb %xmm2, %xmm0
1379 ; SSE-NEXT: packuswb %xmm4, %xmm0
1380 ; SSE-NEXT: psubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1383 ; AVX1-LABEL: trunc_sub_const_v16i64_v16i8:
1385 ; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255]
1386 ; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
1387 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
1388 ; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
1389 ; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
1390 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
1391 ; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2
1392 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
1393 ; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
1394 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
1395 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
1396 ; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
1397 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
1398 ; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
1399 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
1400 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
1401 ; AVX1-NEXT: vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1402 ; AVX1-NEXT: vzeroupper
1405 ; AVX2-LABEL: trunc_sub_const_v16i64_v16i8:
1407 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255]
1408 ; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
1409 ; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
1410 ; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2
1411 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
1412 ; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
1413 ; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
1414 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
1415 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
1416 ; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0
1417 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1418 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
1419 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
1420 ; AVX2-NEXT: vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1421 ; AVX2-NEXT: vzeroupper
1424 ; AVX512-LABEL: trunc_sub_const_v16i64_v16i8:
1426 ; AVX512-NEXT: vpmovqb %zmm1, %xmm1
1427 ; AVX512-NEXT: vpmovqb %zmm0, %xmm0
1428 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1429 ; AVX512-NEXT: vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1430 ; AVX512-NEXT: vzeroupper
1432 %1 = sub <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
1433 %2 = trunc <16 x i64> %1 to <16 x i8>
1434 ret <16 x i8> %2
1435 }
1437 define <16 x i8> @trunc_sub_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
1438 ; SSE-LABEL: trunc_sub_const_v16i32_v16i8:
1440 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
1441 ; SSE-NEXT: pand %xmm4, %xmm3
1442 ; SSE-NEXT: pand %xmm4, %xmm2
1443 ; SSE-NEXT: packuswb %xmm3, %xmm2
1444 ; SSE-NEXT: pand %xmm4, %xmm1
1445 ; SSE-NEXT: pand %xmm4, %xmm0
1446 ; SSE-NEXT: packuswb %xmm1, %xmm0
1447 ; SSE-NEXT: packuswb %xmm2, %xmm0
1448 ; SSE-NEXT: psubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1451 ; AVX1-LABEL: trunc_sub_const_v16i32_v16i8:
1453 ; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
1454 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
1455 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
1456 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
1457 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
1458 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
1459 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
1460 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
1461 ; AVX1-NEXT: vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1462 ; AVX1-NEXT: vzeroupper
1465 ; AVX2-LABEL: trunc_sub_const_v16i32_v16i8:
1467 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
1468 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
1469 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
1470 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
1471 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1472 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
1473 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
1474 ; AVX2-NEXT: vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1475 ; AVX2-NEXT: vzeroupper
1478 ; AVX512-LABEL: trunc_sub_const_v16i32_v16i8:
1480 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
1481 ; AVX512-NEXT: vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1482 ; AVX512-NEXT: vzeroupper
1484 %1 = sub <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1485 %2 = trunc <16 x i32> %1 to <16 x i8>
1486 ret <16 x i8> %2
1487 }
1489 define <16 x i8> @trunc_sub_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
1490 ; SSE-LABEL: trunc_sub_const_v16i16_v16i8:
1492 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
1493 ; SSE-NEXT: pand %xmm2, %xmm1
1494 ; SSE-NEXT: pand %xmm2, %xmm0
1495 ; SSE-NEXT: packuswb %xmm1, %xmm0
1496 ; SSE-NEXT: psubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1499 ; AVX1-LABEL: trunc_sub_const_v16i16_v16i8:
1501 ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1502 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1503 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
1504 ; AVX1-NEXT: vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1505 ; AVX1-NEXT: vzeroupper
1508 ; AVX2-LABEL: trunc_sub_const_v16i16_v16i8:
1510 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1511 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1512 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
1513 ; AVX2-NEXT: vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1514 ; AVX2-NEXT: vzeroupper
1517 ; AVX512F-LABEL: trunc_sub_const_v16i16_v16i8:
1519 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1520 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
1521 ; AVX512F-NEXT: vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1522 ; AVX512F-NEXT: vzeroupper
1523 ; AVX512F-NEXT: retq
1525 ; AVX512BW-LABEL: trunc_sub_const_v16i16_v16i8:
1526 ; AVX512BW: # %bb.0:
1527 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1528 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
1529 ; AVX512BW-NEXT: vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1530 ; AVX512BW-NEXT: vzeroupper
1531 ; AVX512BW-NEXT: retq
1533 ; AVX512DQ-LABEL: trunc_sub_const_v16i16_v16i8:
1534 ; AVX512DQ: # %bb.0:
1535 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1536 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
1537 ; AVX512DQ-NEXT: vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1538 ; AVX512DQ-NEXT: vzeroupper
1539 ; AVX512DQ-NEXT: retq
1540 %1 = sub <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
1541 %2 = trunc <16 x i16> %1 to <16 x i8>
1542 ret <16 x i8> %2
1543 }
1545 define <16 x i8> @trunc_ext_sub_const_rhs_v16i16_v16i8(<16 x i8> %x) {
1546 ; SSE-LABEL: trunc_ext_sub_const_rhs_v16i16_v16i8:
1548 ; SSE-NEXT: psubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1551 ; AVX-LABEL: trunc_ext_sub_const_rhs_v16i16_v16i8:
1553 ; AVX-NEXT: vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1555 %a = zext <16 x i8> %x to <16 x i16>
1556 %b = sub <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
1557 %c = trunc <16 x i16> %b to <16 x i8>
1558 ret <16 x i8> %c
1559 }
1561 define <16 x i8> @trunc_ext_sub_const_lhs_v16i16_v16i8(<16 x i8> %x) {
1562 ; SSE-LABEL: trunc_ext_sub_const_lhs_v16i16_v16i8:
1564 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
1565 ; SSE-NEXT: psubb %xmm0, %xmm1
1566 ; SSE-NEXT: movdqa %xmm1, %xmm0
1569 ; AVX-LABEL: trunc_ext_sub_const_lhs_v16i16_v16i8:
1571 ; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
1572 ; AVX-NEXT: vpsubb %xmm0, %xmm1, %xmm0
1574 %a = zext <16 x i8> %x to <16 x i16>
1575 %b = sub <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, %a
1576 %c = trunc <16 x i16> %b to <16 x i8>
1577 ret <16 x i8> %c
1578 }
1584 define <4 x i32> @trunc_mul_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
1585 ; SSE-LABEL: trunc_mul_v4i64_v4i32:
1587 ; SSE-NEXT: pmuludq %xmm3, %xmm1
1588 ; SSE-NEXT: pmuludq %xmm2, %xmm0
1589 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
1592 ; AVX1-LABEL: trunc_mul_v4i64_v4i32:
1594 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
1595 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
1596 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
1597 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
1598 ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
1599 ; AVX1-NEXT: vzeroupper
1602 ; AVX2-SLOW-LABEL: trunc_mul_v4i64_v4i32:
1603 ; AVX2-SLOW: # %bb.0:
1604 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
1605 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
1606 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
1607 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
1608 ; AVX2-SLOW-NEXT: vpmulld %xmm1, %xmm0, %xmm0
1609 ; AVX2-SLOW-NEXT: vzeroupper
1610 ; AVX2-SLOW-NEXT: retq
1612 ; AVX2-FAST-ALL-LABEL: trunc_mul_v4i64_v4i32:
1613 ; AVX2-FAST-ALL: # %bb.0:
1614 ; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
1615 ; AVX2-FAST-ALL-NEXT: vpermd %ymm1, %ymm2, %ymm1
1616 ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm2, %ymm0
1617 ; AVX2-FAST-ALL-NEXT: vpmulld %xmm1, %xmm0, %xmm0
1618 ; AVX2-FAST-ALL-NEXT: vzeroupper
1619 ; AVX2-FAST-ALL-NEXT: retq
1621 ; AVX2-FAST-PERLANE-LABEL: trunc_mul_v4i64_v4i32:
1622 ; AVX2-FAST-PERLANE: # %bb.0:
1623 ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm1, %xmm2
1624 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
1625 ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm2
1626 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
1627 ; AVX2-FAST-PERLANE-NEXT: vpmulld %xmm1, %xmm0, %xmm0
1628 ; AVX2-FAST-PERLANE-NEXT: vzeroupper
1629 ; AVX2-FAST-PERLANE-NEXT: retq
1631 ; AVX512F-LABEL: trunc_mul_v4i64_v4i32:
1633 ; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
1634 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1635 ; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
1636 ; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
1637 ; AVX512F-NEXT: vpmulld %xmm1, %xmm0, %xmm0
1638 ; AVX512F-NEXT: vzeroupper
1639 ; AVX512F-NEXT: retq
1641 ; AVX512BW-LABEL: trunc_mul_v4i64_v4i32:
1642 ; AVX512BW: # %bb.0:
1643 ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
1644 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1645 ; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
1646 ; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
1647 ; AVX512BW-NEXT: vpmulld %xmm1, %xmm0, %xmm0
1648 ; AVX512BW-NEXT: vzeroupper
1649 ; AVX512BW-NEXT: retq
1651 ; AVX512DQ-LABEL: trunc_mul_v4i64_v4i32:
1652 ; AVX512DQ: # %bb.0:
1653 ; AVX512DQ-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
1654 ; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1655 ; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
1656 ; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0
1657 ; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1658 ; AVX512DQ-NEXT: vzeroupper
1659 ; AVX512DQ-NEXT: retq
1660 %1 = mul <4 x i64> %a0, %a1
1661 %2 = trunc <4 x i64> %1 to <4 x i32>
1662 ret <4 x i32> %2
1663 }
1665 define <8 x i16> @trunc_mul_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
1666 ; SSE-LABEL: trunc_mul_v8i64_v8i16:
1668 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
1669 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7]
1670 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
1671 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
1672 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
1673 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,2,2,3]
1674 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,0,2,4,5,6,7]
1675 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
1676 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,0,2,4,5,6,7]
1677 ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
1678 ; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm4[0],xmm6[1]
1679 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1680 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
1681 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1682 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
1683 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
1684 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
1685 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
1686 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
1687 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
1688 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1689 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
1690 ; SSE-NEXT: pmullw %xmm6, %xmm0
1693 ; AVX1-LABEL: trunc_mul_v8i64_v8i16:
1695 ; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,65535,65535]
1696 ; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
1697 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
1698 ; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
1699 ; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
1700 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
1701 ; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2
1702 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
1703 ; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
1704 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
1705 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
1706 ; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
1707 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
1708 ; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
1709 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
1710 ; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0
1711 ; AVX1-NEXT: vzeroupper
1714 ; AVX2-LABEL: trunc_mul_v8i64_v8i16:
1716 ; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
1717 ; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3],ymm3[4],ymm4[5,6,7],ymm3[8],ymm4[9,10,11],ymm3[12],ymm4[13,14,15]
1718 ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1,2,3],ymm2[4],ymm4[5,6,7],ymm2[8],ymm4[9,10,11],ymm2[12],ymm4[13,14,15]
1719 ; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2
1720 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
1721 ; AVX2-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
1722 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm4[1,2,3],ymm1[4],ymm4[5,6,7],ymm1[8],ymm4[9,10,11],ymm1[12],ymm4[13,14,15]
1723 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm4[1,2,3],ymm0[4],ymm4[5,6,7],ymm0[8],ymm4[9,10,11],ymm0[12],ymm4[13,14,15]
1724 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
1725 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1726 ; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
1727 ; AVX2-NEXT: vpmullw %xmm2, %xmm0, %xmm0
1728 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
1729 ; AVX2-NEXT: vzeroupper
1732 ; AVX512F-LABEL: trunc_mul_v8i64_v8i16:
1734 ; AVX512F-NEXT: vpmovqw %zmm1, %xmm1
1735 ; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
1736 ; AVX512F-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1737 ; AVX512F-NEXT: vzeroupper
1738 ; AVX512F-NEXT: retq
1740 ; AVX512BW-LABEL: trunc_mul_v8i64_v8i16:
1741 ; AVX512BW: # %bb.0:
1742 ; AVX512BW-NEXT: vpmovqw %zmm1, %xmm1
1743 ; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0
1744 ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1745 ; AVX512BW-NEXT: vzeroupper
1746 ; AVX512BW-NEXT: retq
1748 ; AVX512DQ-LABEL: trunc_mul_v8i64_v8i16:
1749 ; AVX512DQ: # %bb.0:
1750 ; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
1751 ; AVX512DQ-NEXT: vpmovqw %zmm0, %xmm0
1752 ; AVX512DQ-NEXT: vzeroupper
1753 ; AVX512DQ-NEXT: retq
1754 %1 = mul <8 x i64> %a0, %a1
1755 %2 = trunc <8 x i64> %1 to <8 x i16>
1756 ret <8 x i16> %2
1757 }
1759 define <8 x i16> @trunc_mul_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
1760 ; SSE-LABEL: trunc_mul_v8i32_v8i16:
1762 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
1763 ; SSE-NEXT: pmuludq %xmm2, %xmm0
1764 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1765 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
1766 ; SSE-NEXT: pmuludq %xmm4, %xmm2
1767 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
1768 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
1769 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
1770 ; SSE-NEXT: pmuludq %xmm3, %xmm1
1771 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1772 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
1773 ; SSE-NEXT: pmuludq %xmm2, %xmm3
1774 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
1775 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
1776 ; SSE-NEXT: pslld $16, %xmm1
1777 ; SSE-NEXT: psrad $16, %xmm1
1778 ; SSE-NEXT: pslld $16, %xmm0
1779 ; SSE-NEXT: psrad $16, %xmm0
1780 ; SSE-NEXT: packssdw %xmm1, %xmm0
1783 ; AVX1-LABEL: trunc_mul_v8i32_v8i16:
1785 ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm2
1786 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
1787 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1788 ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
1789 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
1790 ; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
1791 ; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1
1792 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1793 ; AVX1-NEXT: vzeroupper
1796 ; AVX2-LABEL: trunc_mul_v8i32_v8i16:
1798 ; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0
1799 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
1800 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1801 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1802 ; AVX2-NEXT: vzeroupper
1805 ; AVX512-LABEL: trunc_mul_v8i32_v8i16:
1807 ; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm0
1808 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
1809 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1810 ; AVX512-NEXT: vzeroupper
1812 %1 = mul <8 x i32> %a0, %a1
1813 %2 = trunc <8 x i32> %1 to <8 x i16>
1814 ret <8 x i16> %2
1815 }
1817 define <16 x i8> @trunc_mul_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
1818 ; SSE-LABEL: trunc_mul_v16i64_v16i8:
1820 ; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm0
1821 ; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm1
1822 ; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm2
1823 ; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm3
1824 ; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm4
1825 ; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm5
1826 ; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm6
1827 ; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm7
1828 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
1829 ; SSE-NEXT: pand %xmm8, %xmm7
1830 ; SSE-NEXT: pand %xmm8, %xmm6
1831 ; SSE-NEXT: packuswb %xmm7, %xmm6
1832 ; SSE-NEXT: pand %xmm8, %xmm5
1833 ; SSE-NEXT: pand %xmm8, %xmm4
1834 ; SSE-NEXT: packuswb %xmm5, %xmm4
1835 ; SSE-NEXT: packuswb %xmm6, %xmm4
1836 ; SSE-NEXT: pand %xmm8, %xmm3
1837 ; SSE-NEXT: pand %xmm8, %xmm2
1838 ; SSE-NEXT: packuswb %xmm3, %xmm2
1839 ; SSE-NEXT: pand %xmm8, %xmm1
1840 ; SSE-NEXT: pand %xmm8, %xmm0
1841 ; SSE-NEXT: packuswb %xmm1, %xmm0
1842 ; SSE-NEXT: packuswb %xmm2, %xmm0
1843 ; SSE-NEXT: packuswb %xmm4, %xmm0
1846 ; AVX1-LABEL: trunc_mul_v16i64_v16i8:
1848 ; AVX1-NEXT: vpmuludq %xmm4, %xmm0, %xmm8
1849 ; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm4
1850 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1851 ; AVX1-NEXT: vpmuludq %xmm4, %xmm0, %xmm0
1852 ; AVX1-NEXT: vpmuludq %xmm5, %xmm1, %xmm4
1853 ; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm5
1854 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
1855 ; AVX1-NEXT: vpmuludq %xmm5, %xmm1, %xmm1
1856 ; AVX1-NEXT: vpmuludq %xmm6, %xmm2, %xmm5
1857 ; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm6
1858 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
1859 ; AVX1-NEXT: vpmuludq %xmm6, %xmm2, %xmm2
1860 ; AVX1-NEXT: vpmuludq %xmm7, %xmm3, %xmm6
1861 ; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm7
1862 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
1863 ; AVX1-NEXT: vpmuludq %xmm7, %xmm3, %xmm3
1864 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [255,255]
1865 ; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
1866 ; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6
1867 ; AVX1-NEXT: vpackusdw %xmm3, %xmm6, %xmm3
1868 ; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2
1869 ; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5
1870 ; AVX1-NEXT: vpackusdw %xmm2, %xmm5, %xmm2
1871 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
1872 ; AVX1-NEXT: vpand %xmm7, %xmm1, %xmm1
1873 ; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm3
1874 ; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1
1875 ; AVX1-NEXT: vpand %xmm7, %xmm0, %xmm0
1876 ; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm3
1877 ; AVX1-NEXT: vpackusdw %xmm0, %xmm3, %xmm0
1878 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
1879 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
1880 ; AVX1-NEXT: vzeroupper
1883 ; AVX2-LABEL: trunc_mul_v16i64_v16i8:
1885 ; AVX2-NEXT: vpmuludq %ymm4, %ymm0, %ymm0
1886 ; AVX2-NEXT: vpmuludq %ymm5, %ymm1, %ymm1
1887 ; AVX2-NEXT: vpmuludq %ymm6, %ymm2, %ymm2
1888 ; AVX2-NEXT: vpmuludq %ymm7, %ymm3, %ymm3
1889 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255]
1890 ; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
1891 ; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
1892 ; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2
1893 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
1894 ; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
1895 ; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
1896 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
1897 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
1898 ; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0
1899 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1900 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
1901 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
1902 ; AVX2-NEXT: vzeroupper
1905 ; AVX512F-LABEL: trunc_mul_v16i64_v16i8:
1907 ; AVX512F-NEXT: vpmuludq %zmm2, %zmm0, %zmm0
1908 ; AVX512F-NEXT: vpmuludq %zmm3, %zmm1, %zmm1
1909 ; AVX512F-NEXT: vpmovqb %zmm1, %xmm1
1910 ; AVX512F-NEXT: vpmovqb %zmm0, %xmm0
1911 ; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1912 ; AVX512F-NEXT: vzeroupper
1913 ; AVX512F-NEXT: retq
1915 ; AVX512BW-LABEL: trunc_mul_v16i64_v16i8:
1916 ; AVX512BW: # %bb.0:
1917 ; AVX512BW-NEXT: vpmuludq %zmm2, %zmm0, %zmm0
1918 ; AVX512BW-NEXT: vpmuludq %zmm3, %zmm1, %zmm1
1919 ; AVX512BW-NEXT: vpmovqb %zmm1, %xmm1
1920 ; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0
1921 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1922 ; AVX512BW-NEXT: vzeroupper
1923 ; AVX512BW-NEXT: retq
1925 ; AVX512DQ-LABEL: trunc_mul_v16i64_v16i8:
1926 ; AVX512DQ: # %bb.0:
1927 ; AVX512DQ-NEXT: vpmullq %zmm2, %zmm0, %zmm0
1928 ; AVX512DQ-NEXT: vpmullq %zmm3, %zmm1, %zmm1
1929 ; AVX512DQ-NEXT: vpmovqb %zmm1, %xmm1
1930 ; AVX512DQ-NEXT: vpmovqb %zmm0, %xmm0
1931 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1932 ; AVX512DQ-NEXT: vzeroupper
1933 ; AVX512DQ-NEXT: retq
1934 %1 = mul <16 x i64> %a0, %a1
1935 %2 = trunc <16 x i64> %1 to <16 x i8>
1936 ret <16 x i8> %2
1937 }
1939 define <16 x i8> @trunc_mul_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
1940 ; SSE-LABEL: trunc_mul_v16i32_v16i8:
1942 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3]
1943 ; SSE-NEXT: pmuludq %xmm4, %xmm0
1944 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1945 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
1946 ; SSE-NEXT: pmuludq %xmm8, %xmm4
1947 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
1948 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
1949 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
1950 ; SSE-NEXT: pmuludq %xmm5, %xmm1
1951 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1952 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
1953 ; SSE-NEXT: pmuludq %xmm4, %xmm5
1954 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3]
1955 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
1956 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
1957 ; SSE-NEXT: pmuludq %xmm6, %xmm2
1958 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
1959 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
1960 ; SSE-NEXT: pmuludq %xmm4, %xmm5
1961 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3]
1962 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
1963 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
1964 ; SSE-NEXT: pmuludq %xmm7, %xmm3
1965 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
1966 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
1967 ; SSE-NEXT: pmuludq %xmm4, %xmm5
1968 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3]
1969 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
1970 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
1971 ; SSE-NEXT: pand %xmm4, %xmm3
1972 ; SSE-NEXT: pand %xmm4, %xmm2
1973 ; SSE-NEXT: packuswb %xmm3, %xmm2
1974 ; SSE-NEXT: pand %xmm4, %xmm1
1975 ; SSE-NEXT: pand %xmm4, %xmm0
1976 ; SSE-NEXT: packuswb %xmm1, %xmm0
1977 ; SSE-NEXT: packuswb %xmm2, %xmm0
1980 ; AVX1-LABEL: trunc_mul_v16i32_v16i8:
1982 ; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm4
1983 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
1984 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1985 ; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm0
1986 ; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm2
1987 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
1988 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
1989 ; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm1
1990 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255]
1991 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
1992 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
1993 ; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
1994 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
1995 ; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm2
1996 ; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
1997 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
1998 ; AVX1-NEXT: vzeroupper
2001 ; AVX2-LABEL: trunc_mul_v16i32_v16i8:
2003 ; AVX2-NEXT: vpmulld %ymm2, %ymm0, %ymm0
2004 ; AVX2-NEXT: vpmulld %ymm3, %ymm1, %ymm1
2005 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
2006 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
2007 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
2008 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
2009 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
2010 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
2011 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
2012 ; AVX2-NEXT: vzeroupper
2015 ; AVX512-LABEL: trunc_mul_v16i32_v16i8:
2017 ; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0
2018 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
2019 ; AVX512-NEXT: vzeroupper
2021 %1 = mul <16 x i32> %a0, %a1
2022 %2 = trunc <16 x i32> %1 to <16 x i8>
2023 ret <16 x i8> %2
2024 }
2026 define <16 x i8> @trunc_mul_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
2027 ; SSE-LABEL: trunc_mul_v16i16_v16i8:
2029 ; SSE-NEXT: pmullw %xmm2, %xmm0
2030 ; SSE-NEXT: pmullw %xmm3, %xmm1
2031 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
2032 ; SSE-NEXT: pand %xmm2, %xmm1
2033 ; SSE-NEXT: pand %xmm2, %xmm0
2034 ; SSE-NEXT: packuswb %xmm1, %xmm0
2037 ; AVX1-LABEL: trunc_mul_v16i16_v16i8:
2039 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm2
2040 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
2041 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2042 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
2043 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
2044 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
2045 ; AVX1-NEXT: vpand %xmm1, %xmm2, %xmm1
2046 ; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0
2047 ; AVX1-NEXT: vzeroupper
2050 ; AVX2-LABEL: trunc_mul_v16i16_v16i8:
2052 ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
2053 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2054 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
2055 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
2056 ; AVX2-NEXT: vzeroupper
2059 ; AVX512F-LABEL: trunc_mul_v16i16_v16i8:
2061 ; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm0
2062 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
2063 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
2064 ; AVX512F-NEXT: vzeroupper
2065 ; AVX512F-NEXT: retq
2067 ; AVX512BW-LABEL: trunc_mul_v16i16_v16i8:
2068 ; AVX512BW: # %bb.0:
2069 ; AVX512BW-NEXT: vpmullw %ymm1, %ymm0, %ymm0
2070 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
2071 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2072 ; AVX512BW-NEXT: vzeroupper
2073 ; AVX512BW-NEXT: retq
2075 ; AVX512DQ-LABEL: trunc_mul_v16i16_v16i8:
2076 ; AVX512DQ: # %bb.0:
2077 ; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0
2078 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
2079 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
2080 ; AVX512DQ-NEXT: vzeroupper
2081 ; AVX512DQ-NEXT: retq
2082 %1 = mul <16 x i16> %a0, %a1
2083 %2 = trunc <16 x i16> %1 to <16 x i8>
2084 ret <16 x i8> %2
2085 }
2087 define <8 x i16> @trunc_mul_v8i32_v8i16_zext_8i8(<16 x i8> %a0, <8 x i32> %a1) {
2088 ; SSE-LABEL: trunc_mul_v8i32_v8i16_zext_8i8:
2090 ; SSE-NEXT: pxor %xmm3, %xmm3
2091 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
2092 ; SSE-NEXT: pslld $16, %xmm2
2093 ; SSE-NEXT: psrad $16, %xmm2
2094 ; SSE-NEXT: pslld $16, %xmm1
2095 ; SSE-NEXT: psrad $16, %xmm1
2096 ; SSE-NEXT: packssdw %xmm2, %xmm1
2097 ; SSE-NEXT: pmullw %xmm1, %xmm0
2100 ; AVX1-LABEL: trunc_mul_v8i32_v8i16_zext_8i8:
2102 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
2103 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
2104 ; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
2105 ; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
2106 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
2107 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2108 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
2109 ; AVX1-NEXT: vzeroupper
2112 ; AVX2-LABEL: trunc_mul_v8i32_v8i16_zext_8i8:
2114 ; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2115 ; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
2116 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
2117 ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
2118 ; AVX2-NEXT: vzeroupper
2121 ; AVX512-LABEL: trunc_mul_v8i32_v8i16_zext_8i8:
2123 ; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
2124 ; AVX512-NEXT: vpmovdw %zmm1, %ymm1
2125 ; AVX512-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2126 ; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0
2127 ; AVX512-NEXT: vzeroupper
2129 %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2130 %2 = zext <8 x i8> %1 to <8 x i32>
2131 %3 = mul <8 x i32> %2, %a1
2132 %4 = trunc <8 x i32> %3 to <8 x i16>
2133 ret <8 x i16> %4
2134 }
2140 define <4 x i32> @trunc_mul_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
2141 ; SSE-LABEL: trunc_mul_const_v4i64_v4i32:
2143 ; SSE-NEXT: xorps %xmm2, %xmm2
2144 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
2145 ; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
2146 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2]
2147 ; SSE-NEXT: movaps %xmm2, %xmm0
2150 ; AVX1-LABEL: trunc_mul_const_v4i64_v4i32:
2152 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
2153 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2154 ; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2155 ; AVX1-NEXT: vzeroupper
2158 ; AVX2-SLOW-LABEL: trunc_mul_const_v4i64_v4i32:
2159 ; AVX2-SLOW: # %bb.0:
2160 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
2161 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2162 ; AVX2-SLOW-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2163 ; AVX2-SLOW-NEXT: vzeroupper
2164 ; AVX2-SLOW-NEXT: retq
2166 ; AVX2-FAST-ALL-LABEL: trunc_mul_const_v4i64_v4i32:
2167 ; AVX2-FAST-ALL: # %bb.0:
2168 ; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
2169 ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0
2170 ; AVX2-FAST-ALL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2171 ; AVX2-FAST-ALL-NEXT: vzeroupper
2172 ; AVX2-FAST-ALL-NEXT: retq
2174 ; AVX2-FAST-PERLANE-LABEL: trunc_mul_const_v4i64_v4i32:
2175 ; AVX2-FAST-PERLANE: # %bb.0:
2176 ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1
2177 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2178 ; AVX2-FAST-PERLANE-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2179 ; AVX2-FAST-PERLANE-NEXT: vzeroupper
2180 ; AVX2-FAST-PERLANE-NEXT: retq
2182 ; AVX512-LABEL: trunc_mul_const_v4i64_v4i32:
2184 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
2185 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
2186 ; AVX512-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2187 ; AVX512-NEXT: vzeroupper
2189 %1 = mul <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
2190 %2 = trunc <4 x i64> %1 to <4 x i32>
2191 ret <4 x i32> %2
2192 }
2194 define <8 x i16> @trunc_mul_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
2195 ; SSE-LABEL: trunc_mul_const_v8i64_v8i16:
2197 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
2198 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
2199 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2200 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
2201 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
2202 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
2203 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
2204 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
2205 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
2206 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2207 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
2208 ; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2211 ; AVX1-LABEL: trunc_mul_const_v8i64_v8i16:
2213 ; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
2214 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
2215 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
2216 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
2217 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
2218 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
2219 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
2220 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
2221 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2222 ; AVX1-NEXT: vzeroupper
2225 ; AVX2-LABEL: trunc_mul_const_v8i64_v8i16:
2227 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
2228 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
2229 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
2230 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
2231 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
2232 ; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
2233 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
2234 ; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2235 ; AVX2-NEXT: vzeroupper
2238 ; AVX512-LABEL: trunc_mul_const_v8i64_v8i16:
2240 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0
2241 ; AVX512-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2242 ; AVX512-NEXT: vzeroupper
2244 %1 = mul <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
2245 %2 = trunc <8 x i64> %1 to <8 x i16>
2246 ret <8 x i16> %2
2247 }
2249 define <8 x i16> @trunc_mul_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
2250 ; SSE-LABEL: trunc_mul_const_v8i32_v8i16:
2252 ; SSE-NEXT: pslld $16, %xmm1
2253 ; SSE-NEXT: psrad $16, %xmm1
2254 ; SSE-NEXT: pslld $16, %xmm0
2255 ; SSE-NEXT: psrad $16, %xmm0
2256 ; SSE-NEXT: packssdw %xmm1, %xmm0
2257 ; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2260 ; AVX1-LABEL: trunc_mul_const_v8i32_v8i16:
2262 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
2263 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
2264 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
2265 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
2266 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2267 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2268 ; AVX1-NEXT: vzeroupper
2271 ; AVX2-LABEL: trunc_mul_const_v8i32_v8i16:
2273 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
2274 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
2275 ; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2276 ; AVX2-NEXT: vzeroupper
2279 ; AVX512-LABEL: trunc_mul_const_v8i32_v8i16:
2281 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
2282 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
2283 ; AVX512-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2284 ; AVX512-NEXT: vzeroupper
2286 %1 = mul <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2287 %2 = trunc <8 x i32> %1 to <8 x i16>
2288 ret <8 x i16> %2
2289 }
2291 define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
2292 ; SSE-LABEL: trunc_mul_const_v16i64_v16i8:
2294 ; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
2295 ; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
2296 ; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
2297 ; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
2298 ; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5
2299 ; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
2300 ; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7
2301 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
2302 ; SSE-NEXT: pand %xmm8, %xmm7
2303 ; SSE-NEXT: pand %xmm8, %xmm6
2304 ; SSE-NEXT: packuswb %xmm7, %xmm6
2305 ; SSE-NEXT: pand %xmm8, %xmm5
2306 ; SSE-NEXT: pand %xmm8, %xmm4
2307 ; SSE-NEXT: packuswb %xmm5, %xmm4
2308 ; SSE-NEXT: packuswb %xmm6, %xmm4
2309 ; SSE-NEXT: pand %xmm8, %xmm3
2310 ; SSE-NEXT: pand %xmm8, %xmm2
2311 ; SSE-NEXT: packuswb %xmm3, %xmm2
2312 ; SSE-NEXT: pand %xmm8, %xmm1
2313 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2314 ; SSE-NEXT: packuswb %xmm1, %xmm0
2315 ; SSE-NEXT: packuswb %xmm2, %xmm0
2316 ; SSE-NEXT: packuswb %xmm4, %xmm0
2319 ; AVX1-LABEL: trunc_mul_const_v16i64_v16i8:
2321 ; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm8
2322 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2323 ; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2324 ; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm5
2325 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
2326 ; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
2327 ; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm6
2328 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
2329 ; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
2330 ; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm7
2331 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
2332 ; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
2333 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255]
2334 ; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
2335 ; AVX1-NEXT: vpand %xmm4, %xmm7, %xmm7
2336 ; AVX1-NEXT: vpackusdw %xmm3, %xmm7, %xmm3
2337 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
2338 ; AVX1-NEXT: vpand %xmm4, %xmm6, %xmm6
2339 ; AVX1-NEXT: vpackusdw %xmm2, %xmm6, %xmm2
2340 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
2341 ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
2342 ; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm3
2343 ; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1
2344 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
2345 ; AVX1-NEXT: vpand %xmm4, %xmm8, %xmm3
2346 ; AVX1-NEXT: vpackusdw %xmm0, %xmm3, %xmm0
2347 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
2348 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
2349 ; AVX1-NEXT: vzeroupper
2352 ; AVX2-LABEL: trunc_mul_const_v16i64_v16i8:
2354 ; AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2355 ; AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
2356 ; AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
2357 ; AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
2358 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255]
2359 ; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
2360 ; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
2361 ; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2
2362 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
2363 ; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
2364 ; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
2365 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
2366 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
2367 ; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0
2368 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
2369 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
2370 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
2371 ; AVX2-NEXT: vzeroupper
2374 ; AVX512F-LABEL: trunc_mul_const_v16i64_v16i8:
2376 ; AVX512F-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
2377 ; AVX512F-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
2378 ; AVX512F-NEXT: vpmovqb %zmm1, %xmm1
2379 ; AVX512F-NEXT: vpmovqb %zmm0, %xmm0
2380 ; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2381 ; AVX512F-NEXT: vzeroupper
2382 ; AVX512F-NEXT: retq
2384 ; AVX512BW-LABEL: trunc_mul_const_v16i64_v16i8:
2385 ; AVX512BW: # %bb.0:
2386 ; AVX512BW-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
2387 ; AVX512BW-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
2388 ; AVX512BW-NEXT: vpmovqb %zmm1, %xmm1
2389 ; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0
2390 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2391 ; AVX512BW-NEXT: vzeroupper
2392 ; AVX512BW-NEXT: retq
2394 ; AVX512DQ-LABEL: trunc_mul_const_v16i64_v16i8:
2395 ; AVX512DQ: # %bb.0:
2396 ; AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
2397 ; AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
2398 ; AVX512DQ-NEXT: vpmovqb %zmm1, %xmm1
2399 ; AVX512DQ-NEXT: vpmovqb %zmm0, %xmm0
2400 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2401 ; AVX512DQ-NEXT: vzeroupper
2402 ; AVX512DQ-NEXT: retq
2403 %1 = mul <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
2404 %2 = trunc <16 x i64> %1 to <16 x i8>
2405 ret <16 x i8> %2
2406 }
2408 define <16 x i8> @trunc_mul_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
2409 ; SSE-LABEL: trunc_mul_const_v16i32_v16i8:
2411 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [0,1,2,3]
2412 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
2413 ; SSE-NEXT: pmuludq %xmm4, %xmm0
2414 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2415 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
2416 ; SSE-NEXT: pmuludq %xmm5, %xmm4
2417 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
2418 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
2419 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [4,5,6,7]
2420 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
2421 ; SSE-NEXT: pmuludq %xmm4, %xmm1
2422 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
2423 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
2424 ; SSE-NEXT: pmuludq %xmm5, %xmm4
2425 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
2426 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
2427 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [8,9,10,11]
2428 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
2429 ; SSE-NEXT: pmuludq %xmm4, %xmm2
2430 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
2431 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
2432 ; SSE-NEXT: pmuludq %xmm5, %xmm4
2433 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
2434 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
2435 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [12,13,14,15]
2436 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
2437 ; SSE-NEXT: pmuludq %xmm4, %xmm3
2438 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
2439 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
2440 ; SSE-NEXT: pmuludq %xmm5, %xmm4
2441 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
2442 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
2443 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
2444 ; SSE-NEXT: pand %xmm4, %xmm3
2445 ; SSE-NEXT: pand %xmm4, %xmm2
2446 ; SSE-NEXT: packuswb %xmm3, %xmm2
2447 ; SSE-NEXT: pand %xmm4, %xmm1
2448 ; SSE-NEXT: pand %xmm4, %xmm0
2449 ; SSE-NEXT: packuswb %xmm1, %xmm0
2450 ; SSE-NEXT: packuswb %xmm2, %xmm0
2453 ; AVX1-LABEL: trunc_mul_const_v16i32_v16i8:
2455 ; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
2456 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2457 ; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2458 ; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm3
2459 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
2460 ; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
2461 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255]
2462 ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
2463 ; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
2464 ; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1
2465 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
2466 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
2467 ; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
2468 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
2469 ; AVX1-NEXT: vzeroupper
2472 ; AVX2-LABEL: trunc_mul_const_v16i32_v16i8:
2474 ; AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2475 ; AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
2476 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
2477 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
2478 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
2479 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
2480 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
2481 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
2482 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
2483 ; AVX2-NEXT: vzeroupper
2486 ; AVX512-LABEL: trunc_mul_const_v16i32_v16i8:
2488 ; AVX512-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
2489 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
2490 ; AVX512-NEXT: vzeroupper
2492 %1 = mul <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
2493 %2 = trunc <16 x i32> %1 to <16 x i8>
2494 ret <16 x i8> %2
2495 }
2497 define <16 x i8> @trunc_mul_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
2498 ; SSE-LABEL: trunc_mul_const_v16i16_v16i8:
2500 ; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2501 ; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
2502 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
2503 ; SSE-NEXT: pand %xmm2, %xmm1
2504 ; SSE-NEXT: pand %xmm2, %xmm0
2505 ; SSE-NEXT: packuswb %xmm1, %xmm0
2508 ; AVX1-LABEL: trunc_mul_const_v16i16_v16i8:
2510 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
2511 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2512 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2513 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
2514 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
2515 ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
2516 ; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0
2517 ; AVX1-NEXT: vzeroupper
2520 ; AVX2-LABEL: trunc_mul_const_v16i16_v16i8:
2522 ; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2523 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2524 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
2525 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
2526 ; AVX2-NEXT: vzeroupper
2529 ; AVX512F-LABEL: trunc_mul_const_v16i16_v16i8:
2531 ; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2532 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
2533 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
2534 ; AVX512F-NEXT: vzeroupper
2535 ; AVX512F-NEXT: retq
2537 ; AVX512BW-LABEL: trunc_mul_const_v16i16_v16i8:
2538 ; AVX512BW: # %bb.0:
2539 ; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2540 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
2541 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2542 ; AVX512BW-NEXT: vzeroupper
2543 ; AVX512BW-NEXT: retq
2545 ; AVX512DQ-LABEL: trunc_mul_const_v16i16_v16i8:
2546 ; AVX512DQ: # %bb.0:
2547 ; AVX512DQ-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2548 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
2549 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
2550 ; AVX512DQ-NEXT: vzeroupper
2551 ; AVX512DQ-NEXT: retq
2552 %1 = mul <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
2553 %2 = trunc <16 x i16> %1 to <16 x i8>
2554 ret <16 x i8> %2
2555 }
2561 define <4 x i32> @trunc_and_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2562 ; SSE-LABEL: trunc_and_v4i64_v4i32:
2564 ; SSE-NEXT: andps %xmm3, %xmm1
2565 ; SSE-NEXT: andps %xmm2, %xmm0
2566 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2569 ; AVX1-LABEL: trunc_and_v4i64_v4i32:
2571 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
2572 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
2573 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2574 ; AVX1-NEXT: vzeroupper
2577 ; AVX2-SLOW-LABEL: trunc_and_v4i64_v4i32:
2578 ; AVX2-SLOW: # %bb.0:
2579 ; AVX2-SLOW-NEXT: vandps %ymm1, %ymm0, %ymm0
2580 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
2581 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2582 ; AVX2-SLOW-NEXT: vzeroupper
2583 ; AVX2-SLOW-NEXT: retq
2585 ; AVX2-FAST-ALL-LABEL: trunc_and_v4i64_v4i32:
2586 ; AVX2-FAST-ALL: # %bb.0:
2587 ; AVX2-FAST-ALL-NEXT: vandps %ymm1, %ymm0, %ymm0
2588 ; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
2589 ; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0
2590 ; AVX2-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2591 ; AVX2-FAST-ALL-NEXT: vzeroupper
2592 ; AVX2-FAST-ALL-NEXT: retq
2594 ; AVX2-FAST-PERLANE-LABEL: trunc_and_v4i64_v4i32:
2595 ; AVX2-FAST-PERLANE: # %bb.0:
2596 ; AVX2-FAST-PERLANE-NEXT: vandps %ymm1, %ymm0, %ymm0
2597 ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1
2598 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2599 ; AVX2-FAST-PERLANE-NEXT: vzeroupper
2600 ; AVX2-FAST-PERLANE-NEXT: retq
2602 ; AVX512-LABEL: trunc_and_v4i64_v4i32:
2604 ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
2605 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
2606 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2607 ; AVX512-NEXT: vzeroupper
2609 %1 = and <4 x i64> %a0, %a1
2610 %2 = trunc <4 x i64> %1 to <4 x i32>
2611 ret <4 x i32> %2
2612 }
2614 define <8 x i16> @trunc_and_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
2615 ; SSE-LABEL: trunc_and_v8i64_v8i16:
2617 ; SSE-NEXT: pand %xmm6, %xmm2
2618 ; SSE-NEXT: pand %xmm7, %xmm3
2619 ; SSE-NEXT: pand %xmm4, %xmm0
2620 ; SSE-NEXT: pand %xmm5, %xmm1
2621 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
2622 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
2623 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2624 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
2625 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
2626 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
2627 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
2628 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
2629 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
2630 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2631 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
2634 ; AVX1-LABEL: trunc_and_v8i64_v8i16:
2636 ; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,65535,65535]
2637 ; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
2638 ; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
2639 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
2640 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
2641 ; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
2642 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
2643 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
2644 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
2645 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
2646 ; AVX1-NEXT: vzeroupper
2649 ; AVX2-LABEL: trunc_and_v8i64_v8i16:
2651 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
2652 ; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
2653 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
2654 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
2655 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
2656 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
2657 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
2658 ; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
2659 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
2660 ; AVX2-NEXT: vzeroupper
2663 ; AVX512-LABEL: trunc_and_v8i64_v8i16:
2665 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
2666 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0
2667 ; AVX512-NEXT: vzeroupper
2669 %1 = and <8 x i64> %a0, %a1
2670 %2 = trunc <8 x i64> %1 to <8 x i16>
2671 ret <8 x i16> %2
2672 }
2674 define <8 x i16> @trunc_and_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
2675 ; SSE-LABEL: trunc_and_v8i32_v8i16:
2677 ; SSE-NEXT: pand %xmm2, %xmm0
2678 ; SSE-NEXT: pand %xmm3, %xmm1
2679 ; SSE-NEXT: pslld $16, %xmm1
2680 ; SSE-NEXT: psrad $16, %xmm1
2681 ; SSE-NEXT: pslld $16, %xmm0
2682 ; SSE-NEXT: psrad $16, %xmm0
2683 ; SSE-NEXT: packssdw %xmm1, %xmm0
2686 ; AVX1-LABEL: trunc_and_v8i32_v8i16:
2688 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
2689 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
2690 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
2691 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
2692 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
2693 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2694 ; AVX1-NEXT: vzeroupper
2697 ; AVX2-LABEL: trunc_and_v8i32_v8i16:
2699 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
2700 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
2701 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
2702 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2703 ; AVX2-NEXT: vzeroupper
2706 ; AVX512-LABEL: trunc_and_v8i32_v8i16:
2708 ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
2709 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
2710 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2711 ; AVX512-NEXT: vzeroupper
2713 %1 = and <8 x i32> %a0, %a1
2714 %2 = trunc <8 x i32> %1 to <8 x i16>
2715 ret <8 x i16> %2
2716 }
2718 define <16 x i8> @trunc_and_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
2719 ; SSE-LABEL: trunc_and_v16i64_v16i8:
2721 ; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm0
2722 ; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm1
2723 ; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm2
2724 ; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm3
2725 ; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm4
2726 ; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm5
2727 ; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm6
2728 ; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm7
2729 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
2730 ; SSE-NEXT: pand %xmm8, %xmm7
2731 ; SSE-NEXT: pand %xmm8, %xmm6
2732 ; SSE-NEXT: packuswb %xmm7, %xmm6
2733 ; SSE-NEXT: pand %xmm8, %xmm5
2734 ; SSE-NEXT: pand %xmm8, %xmm4
2735 ; SSE-NEXT: packuswb %xmm5, %xmm4
2736 ; SSE-NEXT: packuswb %xmm6, %xmm4
2737 ; SSE-NEXT: pand %xmm8, %xmm3
2738 ; SSE-NEXT: pand %xmm8, %xmm2
2739 ; SSE-NEXT: packuswb %xmm3, %xmm2
2740 ; SSE-NEXT: pand %xmm8, %xmm1
2741 ; SSE-NEXT: pand %xmm8, %xmm0
2742 ; SSE-NEXT: packuswb %xmm1, %xmm0
2743 ; SSE-NEXT: packuswb %xmm2, %xmm0
2744 ; SSE-NEXT: packuswb %xmm4, %xmm0
2747 ; AVX1-LABEL: trunc_and_v16i64_v16i8:
2749 ; AVX1-NEXT: vmovaps {{.*#+}} ymm8 = [255,255,255,255]
2750 ; AVX1-NEXT: vandps %ymm7, %ymm8, %ymm7
2751 ; AVX1-NEXT: vandps %ymm7, %ymm3, %ymm3
2752 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm7
2753 ; AVX1-NEXT: vpackusdw %xmm7, %xmm3, %xmm3
2754 ; AVX1-NEXT: vandps %ymm6, %ymm8, %ymm6
2755 ; AVX1-NEXT: vandps %ymm6, %ymm2, %ymm2
2756 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6
2757 ; AVX1-NEXT: vpackusdw %xmm6, %xmm2, %xmm2
2758 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
2759 ; AVX1-NEXT: vandps %ymm5, %ymm8, %ymm3
2760 ; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
2761 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
2762 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
2763 ; AVX1-NEXT: vandps %ymm4, %ymm8, %ymm3
2764 ; AVX1-NEXT: vandps %ymm3, %ymm0, %ymm0
2765 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
2766 ; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
2767 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
2768 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
2769 ; AVX1-NEXT: vzeroupper
2772 ; AVX2-LABEL: trunc_and_v16i64_v16i8:
2774 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm8 = [255,255,255,255]
2775 ; AVX2-NEXT: vpand %ymm7, %ymm8, %ymm7
2776 ; AVX2-NEXT: vpand %ymm7, %ymm3, %ymm3
2777 ; AVX2-NEXT: vpand %ymm6, %ymm8, %ymm6
2778 ; AVX2-NEXT: vpand %ymm6, %ymm2, %ymm2
2779 ; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2
2780 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
2781 ; AVX2-NEXT: vpand %ymm5, %ymm8, %ymm3
2782 ; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
2783 ; AVX2-NEXT: vpand %ymm4, %ymm8, %ymm3
2784 ; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0
2785 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
2786 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
2787 ; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0
2788 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
2789 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
2790 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
2791 ; AVX2-NEXT: vzeroupper
2794 ; AVX512-LABEL: trunc_and_v16i64_v16i8:
2796 ; AVX512-NEXT: vpandq %zmm2, %zmm0, %zmm0
2797 ; AVX512-NEXT: vpandq %zmm3, %zmm1, %zmm1
2798 ; AVX512-NEXT: vpmovqb %zmm1, %xmm1
2799 ; AVX512-NEXT: vpmovqb %zmm0, %xmm0
2800 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2801 ; AVX512-NEXT: vzeroupper
2803 %1 = and <16 x i64> %a0, %a1
2804 %2 = trunc <16 x i64> %1 to <16 x i8>
2805 ret <16 x i8> %2
2806 }
2808 define <16 x i8> @trunc_and_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
2809 ; SSE-LABEL: trunc_and_v16i32_v16i8:
2811 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
2812 ; SSE-NEXT: pand %xmm8, %xmm7
2813 ; SSE-NEXT: pand %xmm3, %xmm7
2814 ; SSE-NEXT: pand %xmm8, %xmm6
2815 ; SSE-NEXT: pand %xmm2, %xmm6
2816 ; SSE-NEXT: packuswb %xmm7, %xmm6
2817 ; SSE-NEXT: pand %xmm8, %xmm5
2818 ; SSE-NEXT: pand %xmm1, %xmm5
2819 ; SSE-NEXT: pand %xmm8, %xmm4
2820 ; SSE-NEXT: pand %xmm4, %xmm0
2821 ; SSE-NEXT: packuswb %xmm5, %xmm0
2822 ; SSE-NEXT: packuswb %xmm6, %xmm0
2825 ; AVX1-LABEL: trunc_and_v16i32_v16i8:
2827 ; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255]
2828 ; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
2829 ; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
2830 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
2831 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
2832 ; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
2833 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
2834 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
2835 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
2836 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
2837 ; AVX1-NEXT: vzeroupper
2840 ; AVX2-LABEL: trunc_and_v16i32_v16i8:
2842 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255]
2843 ; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
2844 ; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
2845 ; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
2846 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
2847 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
2848 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
2849 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
2850 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
2851 ; AVX2-NEXT: vzeroupper
2854 ; AVX512-LABEL: trunc_and_v16i32_v16i8:
2856 ; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0
2857 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
2858 ; AVX512-NEXT: vzeroupper
2860 %1 = and <16 x i32> %a0, %a1
2861 %2 = trunc <16 x i32> %1 to <16 x i8>
2862 ret <16 x i8> %2
2863 }
2865 define <16 x i8> @trunc_and_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
2866 ; SSE-LABEL: trunc_and_v16i16_v16i8:
2868 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
2869 ; SSE-NEXT: pand %xmm4, %xmm3
2870 ; SSE-NEXT: pand %xmm1, %xmm3
2871 ; SSE-NEXT: pand %xmm4, %xmm2
2872 ; SSE-NEXT: pand %xmm2, %xmm0
2873 ; SSE-NEXT: packuswb %xmm3, %xmm0
2876 ; AVX1-LABEL: trunc_and_v16i16_v16i8:
2878 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
2879 ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2880 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
2881 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
2882 ; AVX1-NEXT: vzeroupper
2885 ; AVX2-LABEL: trunc_and_v16i16_v16i8:
2887 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
2888 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2889 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
2890 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
2891 ; AVX2-NEXT: vzeroupper
2894 ; AVX512F-LABEL: trunc_and_v16i16_v16i8:
2896 ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
2897 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
2898 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
2899 ; AVX512F-NEXT: vzeroupper
2900 ; AVX512F-NEXT: retq
2902 ; AVX512BW-LABEL: trunc_and_v16i16_v16i8:
2903 ; AVX512BW: # %bb.0:
2904 ; AVX512BW-NEXT: vpand %ymm1, %ymm0, %ymm0
2905 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
2906 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2907 ; AVX512BW-NEXT: vzeroupper
2908 ; AVX512BW-NEXT: retq
2910 ; AVX512DQ-LABEL: trunc_and_v16i16_v16i8:
2911 ; AVX512DQ: # %bb.0:
2912 ; AVX512DQ-NEXT: vpand %ymm1, %ymm0, %ymm0
2913 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
2914 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
2915 ; AVX512DQ-NEXT: vzeroupper
2916 ; AVX512DQ-NEXT: retq
2917 %1 = and <16 x i16> %a0, %a1
2918 %2 = trunc <16 x i16> %1 to <16 x i8>
2919 ret <16 x i8> %2
2920 }
2922 ;
2923 ; and to constant
2924 ;
2926 define <4 x i32> @trunc_and_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
2927 ; SSE-LABEL: trunc_and_const_v4i64_v4i32:
2929 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2930 ; SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2933 ; AVX1-LABEL: trunc_and_const_v4i64_v4i32:
2935 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
2936 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2937 ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2938 ; AVX1-NEXT: vzeroupper
2941 ; AVX2-SLOW-LABEL: trunc_and_const_v4i64_v4i32:
2942 ; AVX2-SLOW: # %bb.0:
2943 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
2944 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2945 ; AVX2-SLOW-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2946 ; AVX2-SLOW-NEXT: vzeroupper
2947 ; AVX2-SLOW-NEXT: retq
2949 ; AVX2-FAST-ALL-LABEL: trunc_and_const_v4i64_v4i32:
2950 ; AVX2-FAST-ALL: # %bb.0:
2951 ; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
2952 ; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0
2953 ; AVX2-FAST-ALL-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2954 ; AVX2-FAST-ALL-NEXT: vzeroupper
2955 ; AVX2-FAST-ALL-NEXT: retq
2957 ; AVX2-FAST-PERLANE-LABEL: trunc_and_const_v4i64_v4i32:
2958 ; AVX2-FAST-PERLANE: # %bb.0:
2959 ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1
2960 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2961 ; AVX2-FAST-PERLANE-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2962 ; AVX2-FAST-PERLANE-NEXT: vzeroupper
2963 ; AVX2-FAST-PERLANE-NEXT: retq
2965 ; AVX512-LABEL: trunc_and_const_v4i64_v4i32:
2967 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
2968 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
2969 ; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2970 ; AVX512-NEXT: vzeroupper
2972 %1 = and <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
2973 %2 = trunc <4 x i64> %1 to <4 x i32>
2974 ret <4 x i32> %2
2975 }
2977 define <8 x i16> @trunc_and_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
2978 ; SSE-LABEL: trunc_and_const_v8i64_v8i16:
2980 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
2981 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
2982 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2983 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
2984 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
2985 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
2986 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
2987 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
2988 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
2989 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2990 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
2991 ; SSE-NEXT: andpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2994 ; AVX1-LABEL: trunc_and_const_v8i64_v8i16:
2996 ; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
2997 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
2998 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
2999 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
3000 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
3001 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
3002 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
3003 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
3004 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3005 ; AVX1-NEXT: vzeroupper
3008 ; AVX2-LABEL: trunc_and_const_v8i64_v8i16:
3010 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
3011 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
3012 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
3013 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
3014 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
3015 ; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
3016 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
3017 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3018 ; AVX2-NEXT: vzeroupper
3021 ; AVX512-LABEL: trunc_and_const_v8i64_v8i16:
3023 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0
3024 ; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3025 ; AVX512-NEXT: vzeroupper
3027 %1 = and <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
3028 %2 = trunc <8 x i64> %1 to <8 x i16>
3029 ret <8 x i16> %2
3030 }
3032 define <8 x i16> @trunc_and_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
3033 ; SSE-LABEL: trunc_and_const_v8i32_v8i16:
3035 ; SSE-NEXT: pslld $16, %xmm1
3036 ; SSE-NEXT: psrad $16, %xmm1
3037 ; SSE-NEXT: pslld $16, %xmm0
3038 ; SSE-NEXT: psrad $16, %xmm0
3039 ; SSE-NEXT: packssdw %xmm1, %xmm0
3040 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
3043 ; AVX1-LABEL: trunc_and_const_v8i32_v8i16:
3045 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
3046 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
3047 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
3048 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
3049 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3050 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3051 ; AVX1-NEXT: vzeroupper
3054 ; AVX2-LABEL: trunc_and_const_v8i32_v8i16:
3056 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
3057 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3058 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3059 ; AVX2-NEXT: vzeroupper
3062 ; AVX512-LABEL: trunc_and_const_v8i32_v8i16:
3064 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
3065 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
3066 ; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3067 ; AVX512-NEXT: vzeroupper
3069 %1 = and <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
3070 %2 = trunc <8 x i32> %1 to <8 x i16>
3071 ret <8 x i16> %2
3072 }
3074 define <16 x i8> @trunc_and_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
3075 ; SSE-LABEL: trunc_and_const_v16i64_v16i8:
3077 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
3078 ; SSE-NEXT: pand %xmm8, %xmm7
3079 ; SSE-NEXT: pand %xmm8, %xmm6
3080 ; SSE-NEXT: packuswb %xmm7, %xmm6
3081 ; SSE-NEXT: pand %xmm8, %xmm5
3082 ; SSE-NEXT: pand %xmm8, %xmm4
3083 ; SSE-NEXT: packuswb %xmm5, %xmm4
3084 ; SSE-NEXT: packuswb %xmm6, %xmm4
3085 ; SSE-NEXT: pand %xmm8, %xmm3
3086 ; SSE-NEXT: pand %xmm8, %xmm2
3087 ; SSE-NEXT: packuswb %xmm3, %xmm2
3088 ; SSE-NEXT: pand %xmm8, %xmm1
3089 ; SSE-NEXT: pand %xmm8, %xmm0
3090 ; SSE-NEXT: packuswb %xmm1, %xmm0
3091 ; SSE-NEXT: packuswb %xmm2, %xmm0
3092 ; SSE-NEXT: packuswb %xmm4, %xmm0
3093 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
3096 ; AVX1-LABEL: trunc_and_const_v16i64_v16i8:
3098 ; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255]
3099 ; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
3100 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
3101 ; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
3102 ; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
3103 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
3104 ; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2
3105 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
3106 ; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
3107 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
3108 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
3109 ; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
3110 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
3111 ; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
3112 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
3113 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
3114 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3115 ; AVX1-NEXT: vzeroupper
3118 ; AVX2-LABEL: trunc_and_const_v16i64_v16i8:
3120 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255]
3121 ; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
3122 ; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
3123 ; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2
3124 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
3125 ; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
3126 ; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
3127 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
3128 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
3129 ; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0
3130 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
3131 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
3132 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
3133 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3134 ; AVX2-NEXT: vzeroupper
3137 ; AVX512-LABEL: trunc_and_const_v16i64_v16i8:
3139 ; AVX512-NEXT: vpmovqb %zmm1, %xmm1
3140 ; AVX512-NEXT: vpmovqb %zmm0, %xmm0
3141 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3142 ; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3143 ; AVX512-NEXT: vzeroupper
3145 %1 = and <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
3146 %2 = trunc <16 x i64> %1 to <16 x i8>
3147 ret <16 x i8> %2
3148 }
3150 define <16 x i8> @trunc_and_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
3151 ; SSE-LABEL: trunc_and_const_v16i32_v16i8:
3153 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
3154 ; SSE-NEXT: pand %xmm4, %xmm3
3155 ; SSE-NEXT: pand %xmm4, %xmm2
3156 ; SSE-NEXT: packuswb %xmm3, %xmm2
3157 ; SSE-NEXT: pand %xmm4, %xmm1
3158 ; SSE-NEXT: pand %xmm4, %xmm0
3159 ; SSE-NEXT: packuswb %xmm1, %xmm0
3160 ; SSE-NEXT: packuswb %xmm2, %xmm0
3161 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
3164 ; AVX1-LABEL: trunc_and_const_v16i32_v16i8:
3166 ; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
3167 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
3168 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
3169 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
3170 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
3171 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
3172 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
3173 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
3174 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3175 ; AVX1-NEXT: vzeroupper
3178 ; AVX2-LABEL: trunc_and_const_v16i32_v16i8:
3180 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
3181 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
3182 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
3183 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
3184 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
3185 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
3186 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
3187 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3188 ; AVX2-NEXT: vzeroupper
3191 ; AVX512-LABEL: trunc_and_const_v16i32_v16i8:
3193 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
3194 ; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3195 ; AVX512-NEXT: vzeroupper
3197 %1 = and <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
3198 %2 = trunc <16 x i32> %1 to <16 x i8>
3199 ret <16 x i8> %2
3200 }
3202 define <16 x i8> @trunc_and_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
3203 ; SSE-LABEL: trunc_and_const_v16i16_v16i8:
3205 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
3206 ; SSE-NEXT: pand %xmm2, %xmm1
3207 ; SSE-NEXT: pand %xmm2, %xmm0
3208 ; SSE-NEXT: packuswb %xmm1, %xmm0
3209 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
3212 ; AVX1-LABEL: trunc_and_const_v16i16_v16i8:
3214 ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
3215 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
3216 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
3217 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3218 ; AVX1-NEXT: vzeroupper
3221 ; AVX2-LABEL: trunc_and_const_v16i16_v16i8:
3223 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
3224 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
3225 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
3226 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3227 ; AVX2-NEXT: vzeroupper
3230 ; AVX512F-LABEL: trunc_and_const_v16i16_v16i8:
3232 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
3233 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
3234 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3235 ; AVX512F-NEXT: vzeroupper
3236 ; AVX512F-NEXT: retq
3238 ; AVX512BW-LABEL: trunc_and_const_v16i16_v16i8:
3239 ; AVX512BW: # %bb.0:
3240 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
3241 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
3242 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3243 ; AVX512BW-NEXT: vzeroupper
3244 ; AVX512BW-NEXT: retq
3246 ; AVX512DQ-LABEL: trunc_and_const_v16i16_v16i8:
3247 ; AVX512DQ: # %bb.0:
3248 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
3249 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
3250 ; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3251 ; AVX512DQ-NEXT: vzeroupper
3252 ; AVX512DQ-NEXT: retq
3253 %1 = and <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
3254 %2 = trunc <16 x i16> %1 to <16 x i8>
3255 ret <16 x i8> %2
3256 }
3258 ;
3259 ; xor
3260 ;
3262 define <4 x i32> @trunc_xor_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
3263 ; SSE-LABEL: trunc_xor_v4i64_v4i32:
3265 ; SSE-NEXT: xorps %xmm3, %xmm1
3266 ; SSE-NEXT: xorps %xmm2, %xmm0
3267 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3270 ; AVX1-LABEL: trunc_xor_v4i64_v4i32:
3272 ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
3273 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
3274 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3275 ; AVX1-NEXT: vzeroupper
3278 ; AVX2-SLOW-LABEL: trunc_xor_v4i64_v4i32:
3279 ; AVX2-SLOW: # %bb.0:
3280 ; AVX2-SLOW-NEXT: vxorps %ymm1, %ymm0, %ymm0
3281 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
3282 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3283 ; AVX2-SLOW-NEXT: vzeroupper
3284 ; AVX2-SLOW-NEXT: retq
3286 ; AVX2-FAST-ALL-LABEL: trunc_xor_v4i64_v4i32:
3287 ; AVX2-FAST-ALL: # %bb.0:
3288 ; AVX2-FAST-ALL-NEXT: vxorps %ymm1, %ymm0, %ymm0
3289 ; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
3290 ; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0
3291 ; AVX2-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
3292 ; AVX2-FAST-ALL-NEXT: vzeroupper
3293 ; AVX2-FAST-ALL-NEXT: retq
3295 ; AVX2-FAST-PERLANE-LABEL: trunc_xor_v4i64_v4i32:
3296 ; AVX2-FAST-PERLANE: # %bb.0:
3297 ; AVX2-FAST-PERLANE-NEXT: vxorps %ymm1, %ymm0, %ymm0
3298 ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1
3299 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3300 ; AVX2-FAST-PERLANE-NEXT: vzeroupper
3301 ; AVX2-FAST-PERLANE-NEXT: retq
3303 ; AVX512-LABEL: trunc_xor_v4i64_v4i32:
3305 ; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0
3306 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
3307 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
3308 ; AVX512-NEXT: vzeroupper
3310 %1 = xor <4 x i64> %a0, %a1
3311 %2 = trunc <4 x i64> %1 to <4 x i32>
3312 ret <4 x i32> %2
3313 }
3315 define <8 x i16> @trunc_xor_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
3316 ; SSE-LABEL: trunc_xor_v8i64_v8i16:
3318 ; SSE-NEXT: pxor %xmm6, %xmm2
3319 ; SSE-NEXT: pxor %xmm7, %xmm3
3320 ; SSE-NEXT: pxor %xmm4, %xmm0
3321 ; SSE-NEXT: pxor %xmm5, %xmm1
3322 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
3323 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
3324 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
3325 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
3326 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
3327 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
3328 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
3329 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
3330 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
3331 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3332 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
3335 ; AVX1-LABEL: trunc_xor_v8i64_v8i16:
3337 ; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0
3338 ; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm1
3339 ; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
3340 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
3341 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
3342 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
3343 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
3344 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
3345 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
3346 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
3347 ; AVX1-NEXT: vzeroupper
3350 ; AVX2-LABEL: trunc_xor_v8i64_v8i16:
3352 ; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
3353 ; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1
3354 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
3355 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
3356 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
3357 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
3358 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
3359 ; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
3360 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
3361 ; AVX2-NEXT: vzeroupper
3364 ; AVX512-LABEL: trunc_xor_v8i64_v8i16:
3366 ; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
3367 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0
3368 ; AVX512-NEXT: vzeroupper
3370 %1 = xor <8 x i64> %a0, %a1
3371 %2 = trunc <8 x i64> %1 to <8 x i16>
3372 ret <8 x i16> %2
3373 }
3375 define <8 x i16> @trunc_xor_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
3376 ; SSE-LABEL: trunc_xor_v8i32_v8i16:
3378 ; SSE-NEXT: pxor %xmm2, %xmm0
3379 ; SSE-NEXT: pxor %xmm3, %xmm1
3380 ; SSE-NEXT: pslld $16, %xmm1
3381 ; SSE-NEXT: psrad $16, %xmm1
3382 ; SSE-NEXT: pslld $16, %xmm0
3383 ; SSE-NEXT: psrad $16, %xmm0
3384 ; SSE-NEXT: packssdw %xmm1, %xmm0
3387 ; AVX1-LABEL: trunc_xor_v8i32_v8i16:
3389 ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
3390 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
3391 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
3392 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
3393 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
3394 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3395 ; AVX1-NEXT: vzeroupper
3398 ; AVX2-LABEL: trunc_xor_v8i32_v8i16:
3400 ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
3401 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
3402 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3403 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
3404 ; AVX2-NEXT: vzeroupper
3407 ; AVX512-LABEL: trunc_xor_v8i32_v8i16:
3409 ; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0
3410 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
3411 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
3412 ; AVX512-NEXT: vzeroupper
3414 %1 = xor <8 x i32> %a0, %a1
3415 %2 = trunc <8 x i32> %1 to <8 x i16>
3416 ret <8 x i16> %2
3417 }
3419 define <16 x i8> @trunc_xor_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
3420 ; SSE-LABEL: trunc_xor_v16i64_v16i8:
3422 ; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm0
3423 ; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm1
3424 ; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm2
3425 ; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm3
3426 ; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm4
3427 ; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm5
3428 ; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm6
3429 ; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm7
3430 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
3431 ; SSE-NEXT: pand %xmm8, %xmm7
3432 ; SSE-NEXT: pand %xmm8, %xmm6
3433 ; SSE-NEXT: packuswb %xmm7, %xmm6
3434 ; SSE-NEXT: pand %xmm8, %xmm5
3435 ; SSE-NEXT: pand %xmm8, %xmm4
3436 ; SSE-NEXT: packuswb %xmm5, %xmm4
3437 ; SSE-NEXT: packuswb %xmm6, %xmm4
3438 ; SSE-NEXT: pand %xmm8, %xmm3
3439 ; SSE-NEXT: pand %xmm8, %xmm2
3440 ; SSE-NEXT: packuswb %xmm3, %xmm2
3441 ; SSE-NEXT: pand %xmm8, %xmm1
3442 ; SSE-NEXT: pand %xmm8, %xmm0
3443 ; SSE-NEXT: packuswb %xmm1, %xmm0
3444 ; SSE-NEXT: packuswb %xmm2, %xmm0
3445 ; SSE-NEXT: packuswb %xmm4, %xmm0
3448 ; AVX1-LABEL: trunc_xor_v16i64_v16i8:
3450 ; AVX1-NEXT: vxorps %ymm4, %ymm0, %ymm0
3451 ; AVX1-NEXT: vxorps %ymm5, %ymm1, %ymm1
3452 ; AVX1-NEXT: vxorps %ymm6, %ymm2, %ymm2
3453 ; AVX1-NEXT: vxorps %ymm7, %ymm3, %ymm3
3454 ; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255]
3455 ; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
3456 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
3457 ; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
3458 ; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
3459 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
3460 ; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2
3461 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
3462 ; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
3463 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
3464 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
3465 ; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
3466 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
3467 ; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
3468 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
3469 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
3470 ; AVX1-NEXT: vzeroupper
3473 ; AVX2-LABEL: trunc_xor_v16i64_v16i8:
3475 ; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm0
3476 ; AVX2-NEXT: vpxor %ymm5, %ymm1, %ymm1
3477 ; AVX2-NEXT: vpxor %ymm6, %ymm2, %ymm2
3478 ; AVX2-NEXT: vpxor %ymm7, %ymm3, %ymm3
3479 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255]
3480 ; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
3481 ; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
3482 ; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2
3483 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
3484 ; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
3485 ; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
3486 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
3487 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
3488 ; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0
3489 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
3490 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
3491 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
3492 ; AVX2-NEXT: vzeroupper
3495 ; AVX512-LABEL: trunc_xor_v16i64_v16i8:
3497 ; AVX512-NEXT: vpxorq %zmm2, %zmm0, %zmm0
3498 ; AVX512-NEXT: vpxorq %zmm3, %zmm1, %zmm1
3499 ; AVX512-NEXT: vpmovqb %zmm1, %xmm1
3500 ; AVX512-NEXT: vpmovqb %zmm0, %xmm0
3501 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3502 ; AVX512-NEXT: vzeroupper
3504 %1 = xor <16 x i64> %a0, %a1
3505 %2 = trunc <16 x i64> %1 to <16 x i8>
3506 ret <16 x i8> %2
3507 }
3509 define <16 x i8> @trunc_xor_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
3510 ; SSE-LABEL: trunc_xor_v16i32_v16i8:
3512 ; SSE-NEXT: pxor %xmm4, %xmm0
3513 ; SSE-NEXT: pxor %xmm5, %xmm1
3514 ; SSE-NEXT: pxor %xmm6, %xmm2
3515 ; SSE-NEXT: pxor %xmm7, %xmm3
3516 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
3517 ; SSE-NEXT: pand %xmm4, %xmm3
3518 ; SSE-NEXT: pand %xmm4, %xmm2
3519 ; SSE-NEXT: packuswb %xmm3, %xmm2
3520 ; SSE-NEXT: pand %xmm4, %xmm1
3521 ; SSE-NEXT: pand %xmm4, %xmm0
3522 ; SSE-NEXT: packuswb %xmm1, %xmm0
3523 ; SSE-NEXT: packuswb %xmm2, %xmm0
3526 ; AVX1-LABEL: trunc_xor_v16i32_v16i8:
3528 ; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0
3529 ; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm1
3530 ; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
3531 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
3532 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
3533 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
3534 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
3535 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
3536 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
3537 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
3538 ; AVX1-NEXT: vzeroupper
3541 ; AVX2-LABEL: trunc_xor_v16i32_v16i8:
3543 ; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
3544 ; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1
3545 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
3546 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
3547 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
3548 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
3549 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
3550 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
3551 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
3552 ; AVX2-NEXT: vzeroupper
3555 ; AVX512-LABEL: trunc_xor_v16i32_v16i8:
3557 ; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0
3558 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
3559 ; AVX512-NEXT: vzeroupper
3561 %1 = xor <16 x i32> %a0, %a1
3562 %2 = trunc <16 x i32> %1 to <16 x i8>
3563 ret <16 x i8> %2
3564 }
3566 define <16 x i8> @trunc_xor_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
3567 ; SSE-LABEL: trunc_xor_v16i16_v16i8:
3569 ; SSE-NEXT: pxor %xmm2, %xmm0
3570 ; SSE-NEXT: pxor %xmm3, %xmm1
3571 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
3572 ; SSE-NEXT: pand %xmm2, %xmm1
3573 ; SSE-NEXT: pand %xmm2, %xmm0
3574 ; SSE-NEXT: packuswb %xmm1, %xmm0
3577 ; AVX1-LABEL: trunc_xor_v16i16_v16i8:
3579 ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
3580 ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
3581 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
3582 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
3583 ; AVX1-NEXT: vzeroupper
3586 ; AVX2-LABEL: trunc_xor_v16i16_v16i8:
3588 ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
3589 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
3590 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
3591 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
3592 ; AVX2-NEXT: vzeroupper
3595 ; AVX512F-LABEL: trunc_xor_v16i16_v16i8:
3597 ; AVX512F-NEXT: vpxor %ymm1, %ymm0, %ymm0
3598 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
3599 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
3600 ; AVX512F-NEXT: vzeroupper
3601 ; AVX512F-NEXT: retq
3603 ; AVX512BW-LABEL: trunc_xor_v16i16_v16i8:
3604 ; AVX512BW: # %bb.0:
3605 ; AVX512BW-NEXT: vpxor %ymm1, %ymm0, %ymm0
3606 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
3607 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
3608 ; AVX512BW-NEXT: vzeroupper
3609 ; AVX512BW-NEXT: retq
3611 ; AVX512DQ-LABEL: trunc_xor_v16i16_v16i8:
3612 ; AVX512DQ: # %bb.0:
3613 ; AVX512DQ-NEXT: vpxor %ymm1, %ymm0, %ymm0
3614 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
3615 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
3616 ; AVX512DQ-NEXT: vzeroupper
3617 ; AVX512DQ-NEXT: retq
3618 %1 = xor <16 x i16> %a0, %a1
3619 %2 = trunc <16 x i16> %1 to <16 x i8>
3620 ret <16 x i8> %2
3621 }
3623 ;
3624 ; xor to constant
3625 ;
3627 define <4 x i32> @trunc_xor_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
3628 ; SSE-LABEL: trunc_xor_const_v4i64_v4i32:
3630 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3631 ; SSE-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
3634 ; AVX1-LABEL: trunc_xor_const_v4i64_v4i32:
3636 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
3637 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3638 ; AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3639 ; AVX1-NEXT: vzeroupper
3642 ; AVX2-SLOW-LABEL: trunc_xor_const_v4i64_v4i32:
3643 ; AVX2-SLOW: # %bb.0:
3644 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
3645 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3646 ; AVX2-SLOW-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3647 ; AVX2-SLOW-NEXT: vzeroupper
3648 ; AVX2-SLOW-NEXT: retq
3650 ; AVX2-FAST-ALL-LABEL: trunc_xor_const_v4i64_v4i32:
3651 ; AVX2-FAST-ALL: # %bb.0:
3652 ; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
3653 ; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0
3654 ; AVX2-FAST-ALL-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3655 ; AVX2-FAST-ALL-NEXT: vzeroupper
3656 ; AVX2-FAST-ALL-NEXT: retq
3658 ; AVX2-FAST-PERLANE-LABEL: trunc_xor_const_v4i64_v4i32:
3659 ; AVX2-FAST-PERLANE: # %bb.0:
3660 ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1
3661 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3662 ; AVX2-FAST-PERLANE-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3663 ; AVX2-FAST-PERLANE-NEXT: vzeroupper
3664 ; AVX2-FAST-PERLANE-NEXT: retq
3666 ; AVX512-LABEL: trunc_xor_const_v4i64_v4i32:
3668 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
3669 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
3670 ; AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3671 ; AVX512-NEXT: vzeroupper
3673 %1 = xor <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
3674 %2 = trunc <4 x i64> %1 to <4 x i32>
3675 ret <4 x i32> %2
3676 }
3678 define <8 x i16> @trunc_xor_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
3679 ; SSE-LABEL: trunc_xor_const_v8i64_v8i16:
3681 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
3682 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
3683 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
3684 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
3685 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
3686 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
3687 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
3688 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
3689 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
3690 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3691 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
3692 ; SSE-NEXT: xorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
3695 ; AVX1-LABEL: trunc_xor_const_v8i64_v8i16:
3697 ; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
3698 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
3699 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
3700 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
3701 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
3702 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
3703 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
3704 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
3705 ; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3706 ; AVX1-NEXT: vzeroupper
3709 ; AVX2-LABEL: trunc_xor_const_v8i64_v8i16:
3711 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
3712 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
3713 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
3714 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
3715 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
3716 ; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
3717 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
3718 ; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3719 ; AVX2-NEXT: vzeroupper
3722 ; AVX512-LABEL: trunc_xor_const_v8i64_v8i16:
3724 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0
3725 ; AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3726 ; AVX512-NEXT: vzeroupper
3728 %1 = xor <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
3729 %2 = trunc <8 x i64> %1 to <8 x i16>
3730 ret <8 x i16> %2
3731 }
3733 define <8 x i16> @trunc_xor_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
3734 ; SSE-LABEL: trunc_xor_const_v8i32_v8i16:
3736 ; SSE-NEXT: pslld $16, %xmm1
3737 ; SSE-NEXT: psrad $16, %xmm1
3738 ; SSE-NEXT: pslld $16, %xmm0
3739 ; SSE-NEXT: psrad $16, %xmm0
3740 ; SSE-NEXT: packssdw %xmm1, %xmm0
3741 ; SSE-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
3744 ; AVX1-LABEL: trunc_xor_const_v8i32_v8i16:
3746 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
3747 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
3748 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
3749 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
3750 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3751 ; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3752 ; AVX1-NEXT: vzeroupper
3755 ; AVX2-LABEL: trunc_xor_const_v8i32_v8i16:
3757 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
3758 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3759 ; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3760 ; AVX2-NEXT: vzeroupper
3763 ; AVX512-LABEL: trunc_xor_const_v8i32_v8i16:
3765 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
3766 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
3767 ; AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3768 ; AVX512-NEXT: vzeroupper
3770 %1 = xor <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
3771 %2 = trunc <8 x i32> %1 to <8 x i16>
3772 ret <8 x i16> %2
3773 }
3775 define <16 x i8> @trunc_xor_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
3776 ; SSE-LABEL: trunc_xor_const_v16i64_v16i8:
3778 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
3779 ; SSE-NEXT: pand %xmm8, %xmm7
3780 ; SSE-NEXT: pand %xmm8, %xmm6
3781 ; SSE-NEXT: packuswb %xmm7, %xmm6
3782 ; SSE-NEXT: pand %xmm8, %xmm5
3783 ; SSE-NEXT: pand %xmm8, %xmm4
3784 ; SSE-NEXT: packuswb %xmm5, %xmm4
3785 ; SSE-NEXT: packuswb %xmm6, %xmm4
3786 ; SSE-NEXT: pand %xmm8, %xmm3
3787 ; SSE-NEXT: pand %xmm8, %xmm2
3788 ; SSE-NEXT: packuswb %xmm3, %xmm2
3789 ; SSE-NEXT: pand %xmm8, %xmm1
3790 ; SSE-NEXT: pand %xmm8, %xmm0
3791 ; SSE-NEXT: packuswb %xmm1, %xmm0
3792 ; SSE-NEXT: packuswb %xmm2, %xmm0
3793 ; SSE-NEXT: packuswb %xmm4, %xmm0
3794 ; SSE-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
3797 ; AVX1-LABEL: trunc_xor_const_v16i64_v16i8:
3799 ; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255]
3800 ; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
3801 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
3802 ; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
3803 ; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
3804 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
3805 ; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2
3806 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
3807 ; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
3808 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
3809 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
3810 ; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
3811 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
3812 ; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
3813 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
3814 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
3815 ; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3816 ; AVX1-NEXT: vzeroupper
3819 ; AVX2-LABEL: trunc_xor_const_v16i64_v16i8:
3821 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255]
3822 ; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
3823 ; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
3824 ; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2
3825 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
3826 ; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
3827 ; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
3828 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
3829 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
3830 ; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0
3831 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
3832 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
3833 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
3834 ; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3835 ; AVX2-NEXT: vzeroupper
3838 ; AVX512-LABEL: trunc_xor_const_v16i64_v16i8:
3840 ; AVX512-NEXT: vpmovqb %zmm1, %xmm1
3841 ; AVX512-NEXT: vpmovqb %zmm0, %xmm0
3842 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3843 ; AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3844 ; AVX512-NEXT: vzeroupper
3846 %1 = xor <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
3847 %2 = trunc <16 x i64> %1 to <16 x i8>
3848 ret <16 x i8> %2
3849 }
3851 define <16 x i8> @trunc_xor_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
3852 ; SSE-LABEL: trunc_xor_const_v16i32_v16i8:
3854 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
3855 ; SSE-NEXT: pand %xmm4, %xmm3
3856 ; SSE-NEXT: pand %xmm4, %xmm2
3857 ; SSE-NEXT: packuswb %xmm3, %xmm2
3858 ; SSE-NEXT: pand %xmm4, %xmm1
3859 ; SSE-NEXT: pand %xmm4, %xmm0
3860 ; SSE-NEXT: packuswb %xmm1, %xmm0
3861 ; SSE-NEXT: packuswb %xmm2, %xmm0
3862 ; SSE-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
3865 ; AVX1-LABEL: trunc_xor_const_v16i32_v16i8:
3867 ; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
3868 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
3869 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
3870 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
3871 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
3872 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
3873 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
3874 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
3875 ; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3876 ; AVX1-NEXT: vzeroupper
3879 ; AVX2-LABEL: trunc_xor_const_v16i32_v16i8:
3881 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
3882 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
3883 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
3884 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
3885 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
3886 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
3887 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
3888 ; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3889 ; AVX2-NEXT: vzeroupper
3892 ; AVX512-LABEL: trunc_xor_const_v16i32_v16i8:
3894 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
3895 ; AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3896 ; AVX512-NEXT: vzeroupper
3898 %1 = xor <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
3899 %2 = trunc <16 x i32> %1 to <16 x i8>
3900 ret <16 x i8> %2
3901 }
3903 define <16 x i8> @trunc_xor_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
3904 ; SSE-LABEL: trunc_xor_const_v16i16_v16i8:
3906 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
3907 ; SSE-NEXT: pand %xmm2, %xmm1
3908 ; SSE-NEXT: pand %xmm2, %xmm0
3909 ; SSE-NEXT: packuswb %xmm1, %xmm0
3910 ; SSE-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
3913 ; AVX1-LABEL: trunc_xor_const_v16i16_v16i8:
3915 ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
3916 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
3917 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
3918 ; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3919 ; AVX1-NEXT: vzeroupper
3922 ; AVX2-LABEL: trunc_xor_const_v16i16_v16i8:
3924 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
3925 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
3926 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
3927 ; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3928 ; AVX2-NEXT: vzeroupper
3931 ; AVX512F-LABEL: trunc_xor_const_v16i16_v16i8:
3933 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
3934 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
3935 ; AVX512F-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3936 ; AVX512F-NEXT: vzeroupper
3937 ; AVX512F-NEXT: retq
3939 ; AVX512BW-LABEL: trunc_xor_const_v16i16_v16i8:
3940 ; AVX512BW: # %bb.0:
3941 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
3942 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
3943 ; AVX512BW-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3944 ; AVX512BW-NEXT: vzeroupper
3945 ; AVX512BW-NEXT: retq
3947 ; AVX512DQ-LABEL: trunc_xor_const_v16i16_v16i8:
3948 ; AVX512DQ: # %bb.0:
3949 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
3950 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
3951 ; AVX512DQ-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3952 ; AVX512DQ-NEXT: vzeroupper
3953 ; AVX512DQ-NEXT: retq
3954 %1 = xor <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
3955 %2 = trunc <16 x i16> %1 to <16 x i8>
3956 ret <16 x i8> %2
3957 }
3959 ;
3960 ; or
3961 ;
3963 define <4 x i32> @trunc_or_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
3964 ; SSE-LABEL: trunc_or_v4i64_v4i32:
3966 ; SSE-NEXT: orps %xmm3, %xmm1
3967 ; SSE-NEXT: orps %xmm2, %xmm0
3968 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3971 ; AVX1-LABEL: trunc_or_v4i64_v4i32:
3973 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
3974 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
3975 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3976 ; AVX1-NEXT: vzeroupper
3979 ; AVX2-SLOW-LABEL: trunc_or_v4i64_v4i32:
3980 ; AVX2-SLOW: # %bb.0:
3981 ; AVX2-SLOW-NEXT: vorps %ymm1, %ymm0, %ymm0
3982 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
3983 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3984 ; AVX2-SLOW-NEXT: vzeroupper
3985 ; AVX2-SLOW-NEXT: retq
3987 ; AVX2-FAST-ALL-LABEL: trunc_or_v4i64_v4i32:
3988 ; AVX2-FAST-ALL: # %bb.0:
3989 ; AVX2-FAST-ALL-NEXT: vorps %ymm1, %ymm0, %ymm0
3990 ; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
3991 ; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0
3992 ; AVX2-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
3993 ; AVX2-FAST-ALL-NEXT: vzeroupper
3994 ; AVX2-FAST-ALL-NEXT: retq
3996 ; AVX2-FAST-PERLANE-LABEL: trunc_or_v4i64_v4i32:
3997 ; AVX2-FAST-PERLANE: # %bb.0:
3998 ; AVX2-FAST-PERLANE-NEXT: vorps %ymm1, %ymm0, %ymm0
3999 ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1
4000 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
4001 ; AVX2-FAST-PERLANE-NEXT: vzeroupper
4002 ; AVX2-FAST-PERLANE-NEXT: retq
4004 ; AVX512-LABEL: trunc_or_v4i64_v4i32:
4006 ; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
4007 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
4008 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
4009 ; AVX512-NEXT: vzeroupper
4011 %1 = or <4 x i64> %a0, %a1
4012 %2 = trunc <4 x i64> %1 to <4 x i32>
4013 ret <4 x i32> %2
4014 }
4016 define <8 x i16> @trunc_or_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
4017 ; SSE-LABEL: trunc_or_v8i64_v8i16:
4019 ; SSE-NEXT: por %xmm6, %xmm2
4020 ; SSE-NEXT: por %xmm7, %xmm3
4021 ; SSE-NEXT: por %xmm4, %xmm0
4022 ; SSE-NEXT: por %xmm5, %xmm1
4023 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
4024 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
4025 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
4026 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
4027 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
4028 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
4029 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
4030 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
4031 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
4032 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
4033 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
4036 ; AVX1-LABEL: trunc_or_v8i64_v8i16:
4038 ; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0
4039 ; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1
4040 ; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
4041 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
4042 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
4043 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
4044 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
4045 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
4046 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
4047 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
4048 ; AVX1-NEXT: vzeroupper
4051 ; AVX2-LABEL: trunc_or_v8i64_v8i16:
4053 ; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0
4054 ; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1
4055 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
4056 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
4057 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
4058 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
4059 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
4060 ; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
4061 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
4062 ; AVX2-NEXT: vzeroupper
4065 ; AVX512-LABEL: trunc_or_v8i64_v8i16:
4067 ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
4068 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0
4069 ; AVX512-NEXT: vzeroupper
4071 %1 = or <8 x i64> %a0, %a1
4072 %2 = trunc <8 x i64> %1 to <8 x i16>
4073 ret <8 x i16> %2
4074 }
4076 define <8 x i16> @trunc_or_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
4077 ; SSE-LABEL: trunc_or_v8i32_v8i16:
4079 ; SSE-NEXT: por %xmm2, %xmm0
4080 ; SSE-NEXT: por %xmm3, %xmm1
4081 ; SSE-NEXT: pslld $16, %xmm1
4082 ; SSE-NEXT: psrad $16, %xmm1
4083 ; SSE-NEXT: pslld $16, %xmm0
4084 ; SSE-NEXT: psrad $16, %xmm0
4085 ; SSE-NEXT: packssdw %xmm1, %xmm0
4088 ; AVX1-LABEL: trunc_or_v8i32_v8i16:
4090 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
4091 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
4092 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
4093 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
4094 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
4095 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4096 ; AVX1-NEXT: vzeroupper
4099 ; AVX2-LABEL: trunc_or_v8i32_v8i16:
4101 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
4102 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
4103 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4104 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
4105 ; AVX2-NEXT: vzeroupper
4108 ; AVX512-LABEL: trunc_or_v8i32_v8i16:
4110 ; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
4111 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
4112 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
4113 ; AVX512-NEXT: vzeroupper
4115 %1 = or <8 x i32> %a0, %a1
4116 %2 = trunc <8 x i32> %1 to <8 x i16>
4117 ret <8 x i16> %2
4118 }
4120 define <16 x i8> @trunc_or_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
4121 ; SSE-LABEL: trunc_or_v16i64_v16i8:
4123 ; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm0
4124 ; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm1
4125 ; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm2
4126 ; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm3
4127 ; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm4
4128 ; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm5
4129 ; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm6
4130 ; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm7
4131 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
4132 ; SSE-NEXT: pand %xmm8, %xmm7
4133 ; SSE-NEXT: pand %xmm8, %xmm6
4134 ; SSE-NEXT: packuswb %xmm7, %xmm6
4135 ; SSE-NEXT: pand %xmm8, %xmm5
4136 ; SSE-NEXT: pand %xmm8, %xmm4
4137 ; SSE-NEXT: packuswb %xmm5, %xmm4
4138 ; SSE-NEXT: packuswb %xmm6, %xmm4
4139 ; SSE-NEXT: pand %xmm8, %xmm3
4140 ; SSE-NEXT: pand %xmm8, %xmm2
4141 ; SSE-NEXT: packuswb %xmm3, %xmm2
4142 ; SSE-NEXT: pand %xmm8, %xmm1
4143 ; SSE-NEXT: pand %xmm8, %xmm0
4144 ; SSE-NEXT: packuswb %xmm1, %xmm0
4145 ; SSE-NEXT: packuswb %xmm2, %xmm0
4146 ; SSE-NEXT: packuswb %xmm4, %xmm0
4149 ; AVX1-LABEL: trunc_or_v16i64_v16i8:
4151 ; AVX1-NEXT: vorps %ymm4, %ymm0, %ymm0
4152 ; AVX1-NEXT: vorps %ymm5, %ymm1, %ymm1
4153 ; AVX1-NEXT: vorps %ymm6, %ymm2, %ymm2
4154 ; AVX1-NEXT: vorps %ymm7, %ymm3, %ymm3
4155 ; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255]
4156 ; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
4157 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
4158 ; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
4159 ; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
4160 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
4161 ; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2
4162 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
4163 ; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
4164 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
4165 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
4166 ; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
4167 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
4168 ; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
4169 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
4170 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
4171 ; AVX1-NEXT: vzeroupper
4174 ; AVX2-LABEL: trunc_or_v16i64_v16i8:
4176 ; AVX2-NEXT: vpor %ymm4, %ymm0, %ymm0
4177 ; AVX2-NEXT: vpor %ymm5, %ymm1, %ymm1
4178 ; AVX2-NEXT: vpor %ymm6, %ymm2, %ymm2
4179 ; AVX2-NEXT: vpor %ymm7, %ymm3, %ymm3
4180 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255]
4181 ; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
4182 ; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
4183 ; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2
4184 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
4185 ; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
4186 ; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
4187 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
4188 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
4189 ; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0
4190 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
4191 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
4192 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
4193 ; AVX2-NEXT: vzeroupper
4196 ; AVX512-LABEL: trunc_or_v16i64_v16i8:
4198 ; AVX512-NEXT: vporq %zmm2, %zmm0, %zmm0
4199 ; AVX512-NEXT: vporq %zmm3, %zmm1, %zmm1
4200 ; AVX512-NEXT: vpmovqb %zmm1, %xmm1
4201 ; AVX512-NEXT: vpmovqb %zmm0, %xmm0
4202 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4203 ; AVX512-NEXT: vzeroupper
4205 %1 = or <16 x i64> %a0, %a1
4206 %2 = trunc <16 x i64> %1 to <16 x i8>
4207 ret <16 x i8> %2
4208 }
4210 define <16 x i8> @trunc_or_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
4211 ; SSE-LABEL: trunc_or_v16i32_v16i8:
4213 ; SSE-NEXT: por %xmm4, %xmm0
4214 ; SSE-NEXT: por %xmm5, %xmm1
4215 ; SSE-NEXT: por %xmm6, %xmm2
4216 ; SSE-NEXT: por %xmm7, %xmm3
4217 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
4218 ; SSE-NEXT: pand %xmm4, %xmm3
4219 ; SSE-NEXT: pand %xmm4, %xmm2
4220 ; SSE-NEXT: packuswb %xmm3, %xmm2
4221 ; SSE-NEXT: pand %xmm4, %xmm1
4222 ; SSE-NEXT: pand %xmm4, %xmm0
4223 ; SSE-NEXT: packuswb %xmm1, %xmm0
4224 ; SSE-NEXT: packuswb %xmm2, %xmm0
4227 ; AVX1-LABEL: trunc_or_v16i32_v16i8:
4229 ; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0
4230 ; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1
4231 ; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
4232 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
4233 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
4234 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
4235 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
4236 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
4237 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
4238 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
4239 ; AVX1-NEXT: vzeroupper
4242 ; AVX2-LABEL: trunc_or_v16i32_v16i8:
4244 ; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0
4245 ; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1
4246 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
4247 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
4248 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
4249 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
4250 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
4251 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
4252 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
4253 ; AVX2-NEXT: vzeroupper
4256 ; AVX512-LABEL: trunc_or_v16i32_v16i8:
4258 ; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0
4259 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
4260 ; AVX512-NEXT: vzeroupper
4262 %1 = or <16 x i32> %a0, %a1
4263 %2 = trunc <16 x i32> %1 to <16 x i8>
4264 ret <16 x i8> %2
4265 }
4267 define <16 x i8> @trunc_or_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
4268 ; SSE-LABEL: trunc_or_v16i16_v16i8:
4270 ; SSE-NEXT: por %xmm2, %xmm0
4271 ; SSE-NEXT: por %xmm3, %xmm1
4272 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
4273 ; SSE-NEXT: pand %xmm2, %xmm1
4274 ; SSE-NEXT: pand %xmm2, %xmm0
4275 ; SSE-NEXT: packuswb %xmm1, %xmm0
4278 ; AVX1-LABEL: trunc_or_v16i16_v16i8:
4280 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
4281 ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
4282 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
4283 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
4284 ; AVX1-NEXT: vzeroupper
4287 ; AVX2-LABEL: trunc_or_v16i16_v16i8:
4289 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
4290 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
4291 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
4292 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
4293 ; AVX2-NEXT: vzeroupper
4296 ; AVX512F-LABEL: trunc_or_v16i16_v16i8:
4298 ; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
4299 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
4300 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
4301 ; AVX512F-NEXT: vzeroupper
4302 ; AVX512F-NEXT: retq
4304 ; AVX512BW-LABEL: trunc_or_v16i16_v16i8:
4305 ; AVX512BW: # %bb.0:
4306 ; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
4307 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
4308 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
4309 ; AVX512BW-NEXT: vzeroupper
4310 ; AVX512BW-NEXT: retq
4312 ; AVX512DQ-LABEL: trunc_or_v16i16_v16i8:
4313 ; AVX512DQ: # %bb.0:
4314 ; AVX512DQ-NEXT: vpor %ymm1, %ymm0, %ymm0
4315 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
4316 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
4317 ; AVX512DQ-NEXT: vzeroupper
4318 ; AVX512DQ-NEXT: retq
4319 %1 = or <16 x i16> %a0, %a1
4320 %2 = trunc <16 x i16> %1 to <16 x i8>
4321 ret <16 x i8> %2
4322 }
4324 ;
4325 ; or to constant
4326 ;
4328 define <4 x i32> @trunc_or_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
4329 ; SSE-LABEL: trunc_or_const_v4i64_v4i32:
4331 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
4332 ; SSE-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
4335 ; AVX1-LABEL: trunc_or_const_v4i64_v4i32:
4337 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
4338 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
4339 ; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4340 ; AVX1-NEXT: vzeroupper
4343 ; AVX2-SLOW-LABEL: trunc_or_const_v4i64_v4i32:
4344 ; AVX2-SLOW: # %bb.0:
4345 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
4346 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
4347 ; AVX2-SLOW-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4348 ; AVX2-SLOW-NEXT: vzeroupper
4349 ; AVX2-SLOW-NEXT: retq
4351 ; AVX2-FAST-ALL-LABEL: trunc_or_const_v4i64_v4i32:
4352 ; AVX2-FAST-ALL: # %bb.0:
4353 ; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
4354 ; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0
4355 ; AVX2-FAST-ALL-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4356 ; AVX2-FAST-ALL-NEXT: vzeroupper
4357 ; AVX2-FAST-ALL-NEXT: retq
4359 ; AVX2-FAST-PERLANE-LABEL: trunc_or_const_v4i64_v4i32:
4360 ; AVX2-FAST-PERLANE: # %bb.0:
4361 ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1
4362 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
4363 ; AVX2-FAST-PERLANE-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4364 ; AVX2-FAST-PERLANE-NEXT: vzeroupper
4365 ; AVX2-FAST-PERLANE-NEXT: retq
4367 ; AVX512-LABEL: trunc_or_const_v4i64_v4i32:
4369 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
4370 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
4371 ; AVX512-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4372 ; AVX512-NEXT: vzeroupper
4374 %1 = or <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
4375 %2 = trunc <4 x i64> %1 to <4 x i32>
4376 ret <4 x i32> %2
4377 }
4379 define <8 x i16> @trunc_or_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
4380 ; SSE-LABEL: trunc_or_const_v8i64_v8i16:
4382 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
4383 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
4384 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
4385 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
4386 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
4387 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
4388 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
4389 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
4390 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
4391 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
4392 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
4393 ; SSE-NEXT: orpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
4396 ; AVX1-LABEL: trunc_or_const_v8i64_v8i16:
4398 ; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
4399 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
4400 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
4401 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
4402 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
4403 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
4404 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
4405 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
4406 ; AVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4407 ; AVX1-NEXT: vzeroupper
4410 ; AVX2-LABEL: trunc_or_const_v8i64_v8i16:
4412 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
4413 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
4414 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
4415 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
4416 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
4417 ; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
4418 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
4419 ; AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4420 ; AVX2-NEXT: vzeroupper
4423 ; AVX512-LABEL: trunc_or_const_v8i64_v8i16:
4425 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0
4426 ; AVX512-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4427 ; AVX512-NEXT: vzeroupper
4429 %1 = or <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
4430 %2 = trunc <8 x i64> %1 to <8 x i16>
4431 ret <8 x i16> %2
4432 }
4434 define <8 x i16> @trunc_or_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
4435 ; SSE-LABEL: trunc_or_const_v8i32_v8i16:
4437 ; SSE-NEXT: pslld $16, %xmm1
4438 ; SSE-NEXT: psrad $16, %xmm1
4439 ; SSE-NEXT: pslld $16, %xmm0
4440 ; SSE-NEXT: psrad $16, %xmm0
4441 ; SSE-NEXT: packssdw %xmm1, %xmm0
4442 ; SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
4445 ; AVX1-LABEL: trunc_or_const_v8i32_v8i16:
4447 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
4448 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
4449 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
4450 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
4451 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4452 ; AVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4453 ; AVX1-NEXT: vzeroupper
4456 ; AVX2-LABEL: trunc_or_const_v8i32_v8i16:
4458 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
4459 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4460 ; AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4461 ; AVX2-NEXT: vzeroupper
4464 ; AVX512-LABEL: trunc_or_const_v8i32_v8i16:
4466 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
4467 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
4468 ; AVX512-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4469 ; AVX512-NEXT: vzeroupper
4471 %1 = or <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
4472 %2 = trunc <8 x i32> %1 to <8 x i16>
4473 ret <8 x i16> %2
4474 }
4476 define <16 x i8> @trunc_or_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
4477 ; SSE-LABEL: trunc_or_const_v16i64_v16i8:
4479 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
4480 ; SSE-NEXT: pand %xmm8, %xmm7
4481 ; SSE-NEXT: pand %xmm8, %xmm6
4482 ; SSE-NEXT: packuswb %xmm7, %xmm6
4483 ; SSE-NEXT: pand %xmm8, %xmm5
4484 ; SSE-NEXT: pand %xmm8, %xmm4
4485 ; SSE-NEXT: packuswb %xmm5, %xmm4
4486 ; SSE-NEXT: packuswb %xmm6, %xmm4
4487 ; SSE-NEXT: pand %xmm8, %xmm3
4488 ; SSE-NEXT: pand %xmm8, %xmm2
4489 ; SSE-NEXT: packuswb %xmm3, %xmm2
4490 ; SSE-NEXT: pand %xmm8, %xmm1
4491 ; SSE-NEXT: pand %xmm8, %xmm0
4492 ; SSE-NEXT: packuswb %xmm1, %xmm0
4493 ; SSE-NEXT: packuswb %xmm2, %xmm0
4494 ; SSE-NEXT: packuswb %xmm4, %xmm0
4495 ; SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
4498 ; AVX1-LABEL: trunc_or_const_v16i64_v16i8:
4500 ; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255]
4501 ; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
4502 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
4503 ; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
4504 ; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
4505 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
4506 ; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2
4507 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
4508 ; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
4509 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
4510 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
4511 ; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
4512 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
4513 ; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
4514 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
4515 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
4516 ; AVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4517 ; AVX1-NEXT: vzeroupper
4520 ; AVX2-LABEL: trunc_or_const_v16i64_v16i8:
4522 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255]
4523 ; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
4524 ; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
4525 ; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2
4526 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
4527 ; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
4528 ; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
4529 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
4530 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
4531 ; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0
4532 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
4533 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
4534 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
4535 ; AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4536 ; AVX2-NEXT: vzeroupper
4539 ; AVX512-LABEL: trunc_or_const_v16i64_v16i8:
4541 ; AVX512-NEXT: vpmovqb %zmm1, %xmm1
4542 ; AVX512-NEXT: vpmovqb %zmm0, %xmm0
4543 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4544 ; AVX512-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4545 ; AVX512-NEXT: vzeroupper
4547 %1 = or <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
4548 %2 = trunc <16 x i64> %1 to <16 x i8>
4549 ret <16 x i8> %2
4550 }
4552 define <16 x i8> @trunc_or_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
4553 ; SSE-LABEL: trunc_or_const_v16i32_v16i8:
4555 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
4556 ; SSE-NEXT: pand %xmm4, %xmm3
4557 ; SSE-NEXT: pand %xmm4, %xmm2
4558 ; SSE-NEXT: packuswb %xmm3, %xmm2
4559 ; SSE-NEXT: pand %xmm4, %xmm1
4560 ; SSE-NEXT: pand %xmm4, %xmm0
4561 ; SSE-NEXT: packuswb %xmm1, %xmm0
4562 ; SSE-NEXT: packuswb %xmm2, %xmm0
4563 ; SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
4566 ; AVX1-LABEL: trunc_or_const_v16i32_v16i8:
4568 ; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
4569 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
4570 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
4571 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
4572 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
4573 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
4574 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
4575 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
4576 ; AVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4577 ; AVX1-NEXT: vzeroupper
4580 ; AVX2-LABEL: trunc_or_const_v16i32_v16i8:
4582 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
4583 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
4584 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
4585 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
4586 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
4587 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
4588 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
4589 ; AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4590 ; AVX2-NEXT: vzeroupper
4593 ; AVX512-LABEL: trunc_or_const_v16i32_v16i8:
4595 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
4596 ; AVX512-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4597 ; AVX512-NEXT: vzeroupper
4599 %1 = or <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
4600 %2 = trunc <16 x i32> %1 to <16 x i8>
4601 ret <16 x i8> %2
4602 }
4604 define <16 x i8> @trunc_or_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
4605 ; SSE-LABEL: trunc_or_const_v16i16_v16i8:
4607 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
4608 ; SSE-NEXT: pand %xmm2, %xmm1
4609 ; SSE-NEXT: pand %xmm2, %xmm0
4610 ; SSE-NEXT: packuswb %xmm1, %xmm0
4611 ; SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
4614 ; AVX1-LABEL: trunc_or_const_v16i16_v16i8:
4616 ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
4617 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
4618 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
4619 ; AVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4620 ; AVX1-NEXT: vzeroupper
4623 ; AVX2-LABEL: trunc_or_const_v16i16_v16i8:
4625 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
4626 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
4627 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
4628 ; AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4629 ; AVX2-NEXT: vzeroupper
4632 ; AVX512F-LABEL: trunc_or_const_v16i16_v16i8:
4634 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
4635 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
4636 ; AVX512F-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4637 ; AVX512F-NEXT: vzeroupper
4638 ; AVX512F-NEXT: retq
4640 ; AVX512BW-LABEL: trunc_or_const_v16i16_v16i8:
4641 ; AVX512BW: # %bb.0:
4642 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
4643 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
4644 ; AVX512BW-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4645 ; AVX512BW-NEXT: vzeroupper
4646 ; AVX512BW-NEXT: retq
4648 ; AVX512DQ-LABEL: trunc_or_const_v16i16_v16i8:
4649 ; AVX512DQ: # %bb.0:
4650 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
4651 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
4652 ; AVX512DQ-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4653 ; AVX512DQ-NEXT: vzeroupper
4654 ; AVX512DQ-NEXT: retq
4655 %1 = or <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
4656 %2 = trunc <16 x i16> %1 to <16 x i8>
4657 ret <16 x i8> %2
4658 }
4661 ; complex patterns - often created by the vectorizer
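;
; A hedged sketch: the scalar function below (hypothetical @mul_add_const_scalar,
; not one of the checked tests) shows the kind of sext/mul/add/trunc chain that
; the loop vectorizer typically widens into the <4 x i64> patterns exercised by
; the mul_add_* tests that follow.
;
; define i32 @mul_add_const_scalar(i32 %a, i32 %b) {
;   %wa = sext i32 %a to i64       ; widen both operands before the multiply
;   %wb = sext i32 %b to i64
;   %m  = mul i64 %wa, %wb         ; 64-bit product of the 32-bit inputs
;   %s  = add i64 %m, 3            ; add a constant in the wide type
;   %t  = trunc i64 %s to i32      ; narrow the result back to i32
;   ret i32 %t
; }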
4664 define <4 x i32> @mul_add_const_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
4665 ; SSE-LABEL: mul_add_const_v4i64_v4i32:
4667 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
4668 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,3,3]
4669 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
4670 ; SSE-NEXT: pmuludq %xmm2, %xmm0
4671 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,3,3]
4672 ; SSE-NEXT: pmuludq %xmm3, %xmm1
4673 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
4674 ; SSE-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
4677 ; AVX-LABEL: mul_add_const_v4i64_v4i32:
4679 ; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
4680 ; AVX-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4682 %1 = sext <4 x i32> %a0 to <4 x i64>
4683 %2 = sext <4 x i32> %a1 to <4 x i64>
4684 %3 = mul <4 x i64> %1, %2
4685 %4 = add <4 x i64> %3, <i64 -3, i64 -1, i64 1, i64 3>
4686 %5 = trunc <4 x i64> %4 to <4 x i32>
4687 ret <4 x i32> %5
4688 }
4690 define <4 x i32> @mul_add_self_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
4691 ; SSE-LABEL: mul_add_self_v4i64_v4i32:
4693 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
4694 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,3,3]
4695 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
4696 ; SSE-NEXT: pmuludq %xmm2, %xmm0
4697 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,3,3]
4698 ; SSE-NEXT: pmuludq %xmm3, %xmm1
4699 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
4700 ; SSE-NEXT: paddd %xmm0, %xmm0
4703 ; AVX-LABEL: mul_add_self_v4i64_v4i32:
4705 ; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
4706 ; AVX-NEXT: vpaddd %xmm0, %xmm0, %xmm0
4708 %1 = sext <4 x i32> %a0 to <4 x i64>
4709 %2 = sext <4 x i32> %a1 to <4 x i64>
4710 %3 = mul <4 x i64> %1, %2
4711 %4 = add <4 x i64> %3, %3
4712 %5 = trunc <4 x i64> %4 to <4 x i32>
4713 ret <4 x i32> %5
4714 }
4716 define <4 x i32> @mul_add_multiuse_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
4717 ; SSE-LABEL: mul_add_multiuse_v4i64_v4i32:
4719 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
4720 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,3,3]
4721 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,1,1,3]
4722 ; SSE-NEXT: pmuludq %xmm2, %xmm4
4723 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,3,3]
4724 ; SSE-NEXT: pmuludq %xmm3, %xmm1
4725 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm1[0,2]
4726 ; SSE-NEXT: paddd %xmm4, %xmm0
4729 ; AVX-LABEL: mul_add_multiuse_v4i64_v4i32:
4731 ; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm1
4732 ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
4734 %1 = sext <4 x i32> %a0 to <4 x i64>
4735 %2 = sext <4 x i32> %a1 to <4 x i64>
4736 %3 = mul <4 x i64> %1, %2
4737 %4 = add <4 x i64> %1, %3
4738 %5 = trunc <4 x i64> %4 to <4 x i32>
4739 ret <4 x i32> %5
4740 }