; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
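;
; The CHECK blocks below are regenerated rather than hand-edited. A sketch of a
; typical invocation (assuming an LLVM source checkout; the test path is a
; placeholder, not a real path from this file):
;   llvm/utils/update_llc_test_checks.py <path-to-this-test>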
13 define <8 x i32> @trunc8i64_8i32(<8 x i64> %a) {
14 ; SSE-LABEL: trunc8i64_8i32:
15 ; SSE: # %bb.0: # %entry
16 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
17 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
18 ; SSE-NEXT: movaps %xmm2, %xmm1
21 ; AVX1-LABEL: trunc8i64_8i32:
22 ; AVX1: # %bb.0: # %entry
23 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
24 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
25 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
26 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
27 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
30 ; AVX2-SLOW-LABEL: trunc8i64_8i32:
31 ; AVX2-SLOW: # %bb.0: # %entry
32 ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
33 ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
34 ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
35 ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
36 ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
37 ; AVX2-SLOW-NEXT: retq
39 ; AVX2-FAST-LABEL: trunc8i64_8i32:
40 ; AVX2-FAST: # %bb.0: # %entry
41 ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
42 ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0
43 ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1
44 ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
45 ; AVX2-FAST-NEXT: retq
47 ; AVX512-LABEL: trunc8i64_8i32:
48 ; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: retq
entry:
  %0 = trunc <8 x i64> %a to <8 x i32>
  ret <8 x i32> %0
}

56 define <8 x i32> @trunc8i64_8i32_ashr(<8 x i64> %a) {
57 ; SSE-LABEL: trunc8i64_8i32_ashr:
58 ; SSE: # %bb.0: # %entry
59 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
60 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3]
61 ; SSE-NEXT: movaps %xmm2, %xmm1
64 ; AVX1-LABEL: trunc8i64_8i32_ashr:
65 ; AVX1: # %bb.0: # %entry
66 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
67 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
68 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3],xmm3[1,3]
69 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3]
70 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
73 ; AVX2-SLOW-LABEL: trunc8i64_8i32_ashr:
74 ; AVX2-SLOW: # %bb.0: # %entry
75 ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
76 ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
77 ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,3,2,3,5,7,6,7]
78 ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
79 ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
80 ; AVX2-SLOW-NEXT: retq
82 ; AVX2-FAST-LABEL: trunc8i64_8i32_ashr:
83 ; AVX2-FAST: # %bb.0: # %entry
84 ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [1,3,5,7,5,7,6,7]
85 ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0
86 ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1
87 ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
88 ; AVX2-FAST-NEXT: retq
90 ; AVX512-LABEL: trunc8i64_8i32_ashr:
91 ; AVX512: # %bb.0: # %entry
92 ; AVX512-NEXT: vpsraq $32, %zmm0, %zmm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: retq
entry:
  %0 = ashr <8 x i64> %a, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %1 = trunc <8 x i64> %0 to <8 x i32>
  ret <8 x i32> %1
}

101 define <8 x i32> @trunc8i64_8i32_lshr(<8 x i64> %a) {
102 ; SSE-LABEL: trunc8i64_8i32_lshr:
103 ; SSE: # %bb.0: # %entry
104 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
105 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3]
106 ; SSE-NEXT: movaps %xmm2, %xmm1
109 ; AVX1-LABEL: trunc8i64_8i32_lshr:
110 ; AVX1: # %bb.0: # %entry
111 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
112 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
113 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3],xmm3[1,3]
114 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3]
115 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
118 ; AVX2-SLOW-LABEL: trunc8i64_8i32_lshr:
119 ; AVX2-SLOW: # %bb.0: # %entry
120 ; AVX2-SLOW-NEXT: vpsrlq $32, %ymm1, %ymm1
121 ; AVX2-SLOW-NEXT: vpsrlq $32, %ymm0, %ymm0
122 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
123 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
124 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
125 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
126 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
127 ; AVX2-SLOW-NEXT: retq
129 ; AVX2-FAST-LABEL: trunc8i64_8i32_lshr:
130 ; AVX2-FAST: # %bb.0: # %entry
131 ; AVX2-FAST-NEXT: vpsrlq $32, %ymm1, %ymm1
132 ; AVX2-FAST-NEXT: vpsrlq $32, %ymm0, %ymm0
133 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
134 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
135 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
136 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
137 ; AVX2-FAST-NEXT: retq
139 ; AVX512-LABEL: trunc8i64_8i32_lshr:
140 ; AVX512: # %bb.0: # %entry
141 ; AVX512-NEXT: vpsrlq $32, %zmm0, %zmm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: retq
entry:
  %0 = lshr <8 x i64> %a, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %1 = trunc <8 x i64> %0 to <8 x i32>
  ret <8 x i32> %1
}

150 define <8 x i16> @trunc8i64_8i16(<8 x i64> %a) {
151 ; SSE2-LABEL: trunc8i64_8i16:
152 ; SSE2: # %bb.0: # %entry
153 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
154 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
155 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
156 ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
157 ; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
158 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
159 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
160 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
161 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
162 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
163 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
166 ; SSSE3-LABEL: trunc8i64_8i16:
167 ; SSSE3: # %bb.0: # %entry
168 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
169 ; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
170 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
171 ; SSSE3-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
172 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
173 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
174 ; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
175 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
176 ; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
177 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
178 ; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
181 ; SSE41-LABEL: trunc8i64_8i16:
182 ; SSE41: # %bb.0: # %entry
183 ; SSE41-NEXT: pxor %xmm4, %xmm4
184 ; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7]
185 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7]
186 ; SSE41-NEXT: packusdw %xmm3, %xmm2
187 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7]
188 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7]
189 ; SSE41-NEXT: packusdw %xmm1, %xmm0
190 ; SSE41-NEXT: packusdw %xmm2, %xmm0
193 ; AVX1-LABEL: trunc8i64_8i16:
194 ; AVX1: # %bb.0: # %entry
195 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
196 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
197 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
198 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
199 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
200 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
201 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
202 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
203 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
204 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
205 ; AVX1-NEXT: vzeroupper
208 ; AVX2-SLOW-LABEL: trunc8i64_8i16:
209 ; AVX2-SLOW: # %bb.0: # %entry
210 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
211 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
212 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
213 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
214 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
215 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
216 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
217 ; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
218 ; AVX2-SLOW-NEXT: vzeroupper
219 ; AVX2-SLOW-NEXT: retq
221 ; AVX2-FAST-LABEL: trunc8i64_8i16:
222 ; AVX2-FAST: # %bb.0: # %entry
223 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
224 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
225 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
226 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
227 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
228 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
229 ; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
230 ; AVX2-FAST-NEXT: vzeroupper
231 ; AVX2-FAST-NEXT: retq
233 ; AVX512-LABEL: trunc8i64_8i16:
234 ; AVX512: # %bb.0: # %entry
235 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
  %0 = trunc <8 x i64> %a to <8 x i16>
  ret <8 x i16> %0
}

243 define void @trunc8i64_8i8(<8 x i64> %a) {
244 ; SSE2-LABEL: trunc8i64_8i8:
245 ; SSE2: # %bb.0: # %entry
246 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
247 ; SSE2-NEXT: pand %xmm4, %xmm3
248 ; SSE2-NEXT: pand %xmm4, %xmm2
249 ; SSE2-NEXT: packuswb %xmm3, %xmm2
250 ; SSE2-NEXT: pand %xmm4, %xmm1
251 ; SSE2-NEXT: pand %xmm4, %xmm0
252 ; SSE2-NEXT: packuswb %xmm1, %xmm0
253 ; SSE2-NEXT: packuswb %xmm2, %xmm0
254 ; SSE2-NEXT: packuswb %xmm0, %xmm0
255 ; SSE2-NEXT: movq %xmm0, (%rax)
258 ; SSSE3-LABEL: trunc8i64_8i8:
259 ; SSSE3: # %bb.0: # %entry
260 ; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
261 ; SSSE3-NEXT: pand %xmm4, %xmm3
262 ; SSSE3-NEXT: pand %xmm4, %xmm2
263 ; SSSE3-NEXT: packuswb %xmm3, %xmm2
264 ; SSSE3-NEXT: pand %xmm4, %xmm1
265 ; SSSE3-NEXT: pand %xmm4, %xmm0
266 ; SSSE3-NEXT: packuswb %xmm1, %xmm0
267 ; SSSE3-NEXT: packuswb %xmm2, %xmm0
268 ; SSSE3-NEXT: packuswb %xmm0, %xmm0
269 ; SSSE3-NEXT: movq %xmm0, (%rax)
272 ; SSE41-LABEL: trunc8i64_8i8:
273 ; SSE41: # %bb.0: # %entry
274 ; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
275 ; SSE41-NEXT: pand %xmm4, %xmm3
276 ; SSE41-NEXT: pand %xmm4, %xmm2
277 ; SSE41-NEXT: packusdw %xmm3, %xmm2
278 ; SSE41-NEXT: pand %xmm4, %xmm1
279 ; SSE41-NEXT: pand %xmm4, %xmm0
280 ; SSE41-NEXT: packusdw %xmm1, %xmm0
281 ; SSE41-NEXT: packusdw %xmm2, %xmm0
282 ; SSE41-NEXT: packuswb %xmm0, %xmm0
283 ; SSE41-NEXT: movq %xmm0, (%rax)
286 ; AVX1-LABEL: trunc8i64_8i8:
287 ; AVX1: # %bb.0: # %entry
288 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
289 ; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [1.2598673968951787E-321,1.2598673968951787E-321]
290 ; AVX1-NEXT: # xmm3 = mem[0,0]
291 ; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2
292 ; AVX1-NEXT: vandps %xmm3, %xmm1, %xmm1
293 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
294 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
295 ; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2
296 ; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm0
297 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
298 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
299 ; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
300 ; AVX1-NEXT: vmovq %xmm0, (%rax)
301 ; AVX1-NEXT: vzeroupper
304 ; AVX2-SLOW-LABEL: trunc8i64_8i8:
305 ; AVX2-SLOW: # %bb.0: # %entry
306 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
307 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
308 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
309 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
310 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
311 ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
312 ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
313 ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
314 ; AVX2-SLOW-NEXT: vmovq %xmm0, (%rax)
315 ; AVX2-SLOW-NEXT: vzeroupper
316 ; AVX2-SLOW-NEXT: retq
318 ; AVX2-FAST-LABEL: trunc8i64_8i8:
319 ; AVX2-FAST: # %bb.0: # %entry
320 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
321 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
322 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
323 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
324 ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1
325 ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0
326 ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
327 ; AVX2-FAST-NEXT: vmovq %xmm0, (%rax)
328 ; AVX2-FAST-NEXT: vzeroupper
329 ; AVX2-FAST-NEXT: retq
331 ; AVX512-LABEL: trunc8i64_8i8:
332 ; AVX512: # %bb.0: # %entry
333 ; AVX512-NEXT: vpmovqb %zmm0, (%rax)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
  %0 = trunc <8 x i64> %a to <8 x i8>
  store <8 x i8> %0, <8 x i8>* undef, align 4
  ret void
}

342 define <8 x i16> @trunc8i32_8i16(<8 x i32> %a) {
343 ; SSE2-LABEL: trunc8i32_8i16:
344 ; SSE2: # %bb.0: # %entry
345 ; SSE2-NEXT: pslld $16, %xmm1
346 ; SSE2-NEXT: psrad $16, %xmm1
347 ; SSE2-NEXT: pslld $16, %xmm0
348 ; SSE2-NEXT: psrad $16, %xmm0
349 ; SSE2-NEXT: packssdw %xmm1, %xmm0
352 ; SSSE3-LABEL: trunc8i32_8i16:
353 ; SSSE3: # %bb.0: # %entry
354 ; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
355 ; SSSE3-NEXT: pshufb %xmm2, %xmm1
356 ; SSSE3-NEXT: pshufb %xmm2, %xmm0
357 ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
360 ; SSE41-LABEL: trunc8i32_8i16:
361 ; SSE41: # %bb.0: # %entry
362 ; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
363 ; SSE41-NEXT: pshufb %xmm2, %xmm1
364 ; SSE41-NEXT: pshufb %xmm2, %xmm0
365 ; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
368 ; AVX1-LABEL: trunc8i32_8i16:
369 ; AVX1: # %bb.0: # %entry
370 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
371 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
372 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
373 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
374 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
375 ; AVX1-NEXT: vzeroupper
378 ; AVX2-LABEL: trunc8i32_8i16:
379 ; AVX2: # %bb.0: # %entry
380 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
381 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
382 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
383 ; AVX2-NEXT: vzeroupper
386 ; AVX512F-LABEL: trunc8i32_8i16:
387 ; AVX512F: # %bb.0: # %entry
388 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
389 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
390 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
391 ; AVX512F-NEXT: vzeroupper
394 ; AVX512VL-LABEL: trunc8i32_8i16:
395 ; AVX512VL: # %bb.0: # %entry
396 ; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
397 ; AVX512VL-NEXT: vzeroupper
398 ; AVX512VL-NEXT: retq
400 ; AVX512BW-LABEL: trunc8i32_8i16:
401 ; AVX512BW: # %bb.0: # %entry
402 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
403 ; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
404 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
405 ; AVX512BW-NEXT: vzeroupper
406 ; AVX512BW-NEXT: retq
408 ; AVX512BWVL-LABEL: trunc8i32_8i16:
409 ; AVX512BWVL: # %bb.0: # %entry
410 ; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0
411 ; AVX512BWVL-NEXT: vzeroupper
412 ; AVX512BWVL-NEXT: retq
entry:
  %0 = trunc <8 x i32> %a to <8 x i16>
  ret <8 x i16> %0
}

418 define <8 x i16> @trunc8i32_8i16_ashr(<8 x i32> %a) {
419 ; SSE-LABEL: trunc8i32_8i16_ashr:
420 ; SSE: # %bb.0: # %entry
421 ; SSE-NEXT: psrad $16, %xmm1
422 ; SSE-NEXT: psrad $16, %xmm0
423 ; SSE-NEXT: packssdw %xmm1, %xmm0
426 ; AVX1-LABEL: trunc8i32_8i16_ashr:
427 ; AVX1: # %bb.0: # %entry
428 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
429 ; AVX1-NEXT: vpsrad $16, %xmm1, %xmm1
430 ; AVX1-NEXT: vpsrad $16, %xmm0, %xmm0
431 ; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
432 ; AVX1-NEXT: vzeroupper
435 ; AVX2-LABEL: trunc8i32_8i16_ashr:
436 ; AVX2: # %bb.0: # %entry
437 ; AVX2-NEXT: vpsrad $16, %ymm0, %ymm0
438 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
439 ; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
440 ; AVX2-NEXT: vzeroupper
443 ; AVX512F-LABEL: trunc8i32_8i16_ashr:
444 ; AVX512F: # %bb.0: # %entry
445 ; AVX512F-NEXT: vpsrad $16, %ymm0, %ymm0
446 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
447 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
448 ; AVX512F-NEXT: vzeroupper
451 ; AVX512VL-LABEL: trunc8i32_8i16_ashr:
452 ; AVX512VL: # %bb.0: # %entry
453 ; AVX512VL-NEXT: vpsrad $16, %ymm0, %ymm0
454 ; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
455 ; AVX512VL-NEXT: vzeroupper
456 ; AVX512VL-NEXT: retq
458 ; AVX512BW-LABEL: trunc8i32_8i16_ashr:
459 ; AVX512BW: # %bb.0: # %entry
460 ; AVX512BW-NEXT: vpsrad $16, %ymm0, %ymm0
461 ; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
462 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
463 ; AVX512BW-NEXT: vzeroupper
464 ; AVX512BW-NEXT: retq
466 ; AVX512BWVL-LABEL: trunc8i32_8i16_ashr:
467 ; AVX512BWVL: # %bb.0: # %entry
468 ; AVX512BWVL-NEXT: vpsrad $16, %ymm0, %ymm0
469 ; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0
470 ; AVX512BWVL-NEXT: vzeroupper
471 ; AVX512BWVL-NEXT: retq
entry:
  %0 = ashr <8 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %1 = trunc <8 x i32> %0 to <8 x i16>
  ret <8 x i16> %1
}

478 define <8 x i16> @trunc8i32_8i16_lshr(<8 x i32> %a) {
479 ; SSE2-LABEL: trunc8i32_8i16_lshr:
480 ; SSE2: # %bb.0: # %entry
481 ; SSE2-NEXT: psrad $16, %xmm1
482 ; SSE2-NEXT: psrad $16, %xmm0
483 ; SSE2-NEXT: packssdw %xmm1, %xmm0
486 ; SSSE3-LABEL: trunc8i32_8i16_lshr:
487 ; SSSE3: # %bb.0: # %entry
488 ; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,10,11,14,15,14,15,255,255]
489 ; SSSE3-NEXT: pshufb %xmm2, %xmm1
490 ; SSSE3-NEXT: pshufb %xmm2, %xmm0
491 ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
494 ; SSE41-LABEL: trunc8i32_8i16_lshr:
495 ; SSE41: # %bb.0: # %entry
496 ; SSE41-NEXT: psrld $16, %xmm1
497 ; SSE41-NEXT: psrld $16, %xmm0
498 ; SSE41-NEXT: packusdw %xmm1, %xmm0
501 ; AVX1-LABEL: trunc8i32_8i16_lshr:
502 ; AVX1: # %bb.0: # %entry
503 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
504 ; AVX1-NEXT: vpsrld $16, %xmm1, %xmm1
505 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
506 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
507 ; AVX1-NEXT: vzeroupper
510 ; AVX2-LABEL: trunc8i32_8i16_lshr:
511 ; AVX2: # %bb.0: # %entry
512 ; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
513 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
514 ; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
515 ; AVX2-NEXT: vzeroupper
518 ; AVX512F-LABEL: trunc8i32_8i16_lshr:
519 ; AVX512F: # %bb.0: # %entry
520 ; AVX512F-NEXT: vpsrld $16, %ymm0, %ymm0
521 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
522 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
523 ; AVX512F-NEXT: vzeroupper
526 ; AVX512VL-LABEL: trunc8i32_8i16_lshr:
527 ; AVX512VL: # %bb.0: # %entry
528 ; AVX512VL-NEXT: vpsrld $16, %ymm0, %ymm0
529 ; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
530 ; AVX512VL-NEXT: vzeroupper
531 ; AVX512VL-NEXT: retq
533 ; AVX512BW-LABEL: trunc8i32_8i16_lshr:
534 ; AVX512BW: # %bb.0: # %entry
535 ; AVX512BW-NEXT: vpsrld $16, %ymm0, %ymm0
536 ; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
537 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
538 ; AVX512BW-NEXT: vzeroupper
539 ; AVX512BW-NEXT: retq
541 ; AVX512BWVL-LABEL: trunc8i32_8i16_lshr:
542 ; AVX512BWVL: # %bb.0: # %entry
543 ; AVX512BWVL-NEXT: vpsrld $16, %ymm0, %ymm0
544 ; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0
545 ; AVX512BWVL-NEXT: vzeroupper
546 ; AVX512BWVL-NEXT: retq
entry:
  %0 = lshr <8 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %1 = trunc <8 x i32> %0 to <8 x i16>
  ret <8 x i16> %1
}

553 define void @trunc8i32_8i8(<8 x i32> %a) {
554 ; SSE2-LABEL: trunc8i32_8i8:
555 ; SSE2: # %bb.0: # %entry
556 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
557 ; SSE2-NEXT: pand %xmm2, %xmm1
558 ; SSE2-NEXT: pand %xmm2, %xmm0
559 ; SSE2-NEXT: packuswb %xmm1, %xmm0
560 ; SSE2-NEXT: packuswb %xmm0, %xmm0
561 ; SSE2-NEXT: movq %xmm0, (%rax)
564 ; SSSE3-LABEL: trunc8i32_8i8:
565 ; SSSE3: # %bb.0: # %entry
566 ; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
567 ; SSSE3-NEXT: pshufb %xmm2, %xmm1
568 ; SSSE3-NEXT: pshufb %xmm2, %xmm0
569 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
570 ; SSSE3-NEXT: movq %xmm0, (%rax)
573 ; SSE41-LABEL: trunc8i32_8i8:
574 ; SSE41: # %bb.0: # %entry
575 ; SSE41-NEXT: movdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
576 ; SSE41-NEXT: pshufb %xmm2, %xmm1
577 ; SSE41-NEXT: pshufb %xmm2, %xmm0
578 ; SSE41-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
579 ; SSE41-NEXT: movq %xmm0, (%rax)
582 ; AVX1-LABEL: trunc8i32_8i8:
583 ; AVX1: # %bb.0: # %entry
584 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
585 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
586 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
587 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
588 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
589 ; AVX1-NEXT: vmovq %xmm0, (%rax)
590 ; AVX1-NEXT: vzeroupper
593 ; AVX2-LABEL: trunc8i32_8i8:
594 ; AVX2: # %bb.0: # %entry
595 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
596 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
597 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
598 ; AVX2-NEXT: vmovq %xmm0, (%rax)
599 ; AVX2-NEXT: vzeroupper
602 ; AVX512F-LABEL: trunc8i32_8i8:
603 ; AVX512F: # %bb.0: # %entry
604 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
605 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
606 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
607 ; AVX512F-NEXT: vmovq %xmm0, (%rax)
608 ; AVX512F-NEXT: vzeroupper
611 ; AVX512VL-LABEL: trunc8i32_8i8:
612 ; AVX512VL: # %bb.0: # %entry
613 ; AVX512VL-NEXT: vpmovdb %ymm0, (%rax)
614 ; AVX512VL-NEXT: vzeroupper
615 ; AVX512VL-NEXT: retq
617 ; AVX512BW-LABEL: trunc8i32_8i8:
618 ; AVX512BW: # %bb.0: # %entry
619 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
620 ; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
621 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
622 ; AVX512BW-NEXT: vmovq %xmm0, (%rax)
623 ; AVX512BW-NEXT: vzeroupper
624 ; AVX512BW-NEXT: retq
626 ; AVX512BWVL-LABEL: trunc8i32_8i8:
627 ; AVX512BWVL: # %bb.0: # %entry
628 ; AVX512BWVL-NEXT: vpmovdb %ymm0, (%rax)
629 ; AVX512BWVL-NEXT: vzeroupper
630 ; AVX512BWVL-NEXT: retq
entry:
  %0 = trunc <8 x i32> %a to <8 x i8>
  store <8 x i8> %0, <8 x i8>* undef, align 4
  ret void
}

637 define void @trunc16i32_16i16(<16 x i32> %a) {
638 ; SSE2-LABEL: trunc16i32_16i16:
639 ; SSE2: # %bb.0: # %entry
640 ; SSE2-NEXT: pslld $16, %xmm1
641 ; SSE2-NEXT: psrad $16, %xmm1
642 ; SSE2-NEXT: pslld $16, %xmm0
643 ; SSE2-NEXT: psrad $16, %xmm0
644 ; SSE2-NEXT: packssdw %xmm1, %xmm0
645 ; SSE2-NEXT: pslld $16, %xmm3
646 ; SSE2-NEXT: psrad $16, %xmm3
647 ; SSE2-NEXT: pslld $16, %xmm2
648 ; SSE2-NEXT: psrad $16, %xmm2
649 ; SSE2-NEXT: packssdw %xmm3, %xmm2
650 ; SSE2-NEXT: movdqu %xmm2, (%rax)
651 ; SSE2-NEXT: movdqu %xmm0, (%rax)
654 ; SSSE3-LABEL: trunc16i32_16i16:
655 ; SSSE3: # %bb.0: # %entry
656 ; SSSE3-NEXT: pslld $16, %xmm1
657 ; SSSE3-NEXT: psrad $16, %xmm1
658 ; SSSE3-NEXT: pslld $16, %xmm0
659 ; SSSE3-NEXT: psrad $16, %xmm0
660 ; SSSE3-NEXT: packssdw %xmm1, %xmm0
661 ; SSSE3-NEXT: pslld $16, %xmm3
662 ; SSSE3-NEXT: psrad $16, %xmm3
663 ; SSSE3-NEXT: pslld $16, %xmm2
664 ; SSSE3-NEXT: psrad $16, %xmm2
665 ; SSSE3-NEXT: packssdw %xmm3, %xmm2
666 ; SSSE3-NEXT: movdqu %xmm2, (%rax)
667 ; SSSE3-NEXT: movdqu %xmm0, (%rax)
670 ; SSE41-LABEL: trunc16i32_16i16:
671 ; SSE41: # %bb.0: # %entry
672 ; SSE41-NEXT: pxor %xmm4, %xmm4
673 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3],xmm1[4],xmm4[5],xmm1[6],xmm4[7]
674 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4],xmm4[5],xmm0[6],xmm4[7]
675 ; SSE41-NEXT: packusdw %xmm1, %xmm0
676 ; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3],xmm3[4],xmm4[5],xmm3[6],xmm4[7]
677 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3],xmm2[4],xmm4[5],xmm2[6],xmm4[7]
678 ; SSE41-NEXT: packusdw %xmm3, %xmm2
679 ; SSE41-NEXT: movdqu %xmm2, (%rax)
680 ; SSE41-NEXT: movdqu %xmm0, (%rax)
683 ; AVX1-LABEL: trunc16i32_16i16:
684 ; AVX1: # %bb.0: # %entry
685 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
686 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
687 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3],xmm2[4],xmm3[5],xmm2[6],xmm3[7]
688 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2],xmm3[3],xmm1[4],xmm3[5],xmm1[6],xmm3[7]
689 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
690 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
691 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3],xmm2[4],xmm3[5],xmm2[6],xmm3[7]
692 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3],xmm0[4],xmm3[5],xmm0[6],xmm3[7]
693 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
694 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
695 ; AVX1-NEXT: vmovups %ymm0, (%rax)
696 ; AVX1-NEXT: vzeroupper
699 ; AVX2-LABEL: trunc16i32_16i16:
700 ; AVX2: # %bb.0: # %entry
701 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
702 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
703 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
704 ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
705 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
706 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
707 ; AVX2-NEXT: vmovdqu %ymm0, (%rax)
708 ; AVX2-NEXT: vzeroupper
711 ; AVX512-LABEL: trunc16i32_16i16:
712 ; AVX512: # %bb.0: # %entry
713 ; AVX512-NEXT: vpmovdw %zmm0, (%rax)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
  %0 = trunc <16 x i32> %a to <16 x i16>
  store <16 x i16> %0, <16 x i16>* undef, align 4
  ret void
}

722 define void @trunc16i32_16i16_ashr(<16 x i32> %a) {
723 ; SSE-LABEL: trunc16i32_16i16_ashr:
724 ; SSE: # %bb.0: # %entry
725 ; SSE-NEXT: psrad $16, %xmm3
726 ; SSE-NEXT: psrad $16, %xmm2
727 ; SSE-NEXT: packssdw %xmm3, %xmm2
728 ; SSE-NEXT: psrad $16, %xmm1
729 ; SSE-NEXT: psrad $16, %xmm0
730 ; SSE-NEXT: packssdw %xmm1, %xmm0
731 ; SSE-NEXT: movdqu %xmm2, (%rax)
732 ; SSE-NEXT: movdqu %xmm0, (%rax)
735 ; AVX1-LABEL: trunc16i32_16i16_ashr:
736 ; AVX1: # %bb.0: # %entry
737 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
738 ; AVX1-NEXT: vpsrad $16, %xmm2, %xmm2
739 ; AVX1-NEXT: vpsrad $16, %xmm0, %xmm0
740 ; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
741 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
742 ; AVX1-NEXT: vpsrad $16, %xmm2, %xmm2
743 ; AVX1-NEXT: vpsrad $16, %xmm1, %xmm1
744 ; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
745 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
746 ; AVX1-NEXT: vmovups %ymm0, (%rax)
747 ; AVX1-NEXT: vzeroupper
750 ; AVX2-LABEL: trunc16i32_16i16_ashr:
751 ; AVX2: # %bb.0: # %entry
752 ; AVX2-NEXT: vpsrad $16, %ymm1, %ymm1
753 ; AVX2-NEXT: vpsrad $16, %ymm0, %ymm0
754 ; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
755 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
756 ; AVX2-NEXT: vmovdqu %ymm0, (%rax)
757 ; AVX2-NEXT: vzeroupper
760 ; AVX512-LABEL: trunc16i32_16i16_ashr:
761 ; AVX512: # %bb.0: # %entry
762 ; AVX512-NEXT: vpsrld $16, %zmm0, %zmm0
763 ; AVX512-NEXT: vpmovdw %zmm0, (%rax)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
  %0 = ashr <16 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %1 = trunc <16 x i32> %0 to <16 x i16>
  store <16 x i16> %1, <16 x i16>* undef, align 4
  ret void
}

773 define void @trunc16i32_16i16_lshr(<16 x i32> %a) {
774 ; SSE2-LABEL: trunc16i32_16i16_lshr:
775 ; SSE2: # %bb.0: # %entry
776 ; SSE2-NEXT: psrad $16, %xmm1
777 ; SSE2-NEXT: psrad $16, %xmm0
778 ; SSE2-NEXT: packssdw %xmm1, %xmm0
779 ; SSE2-NEXT: psrad $16, %xmm3
780 ; SSE2-NEXT: psrad $16, %xmm2
781 ; SSE2-NEXT: packssdw %xmm3, %xmm2
782 ; SSE2-NEXT: movdqu %xmm2, (%rax)
783 ; SSE2-NEXT: movdqu %xmm0, (%rax)
786 ; SSSE3-LABEL: trunc16i32_16i16_lshr:
787 ; SSSE3: # %bb.0: # %entry
788 ; SSSE3-NEXT: psrad $16, %xmm1
789 ; SSSE3-NEXT: psrad $16, %xmm0
790 ; SSSE3-NEXT: packssdw %xmm1, %xmm0
791 ; SSSE3-NEXT: psrad $16, %xmm3
792 ; SSSE3-NEXT: psrad $16, %xmm2
793 ; SSSE3-NEXT: packssdw %xmm3, %xmm2
794 ; SSSE3-NEXT: movdqu %xmm2, (%rax)
795 ; SSSE3-NEXT: movdqu %xmm0, (%rax)
798 ; SSE41-LABEL: trunc16i32_16i16_lshr:
799 ; SSE41: # %bb.0: # %entry
800 ; SSE41-NEXT: psrld $16, %xmm3
801 ; SSE41-NEXT: psrld $16, %xmm2
802 ; SSE41-NEXT: packusdw %xmm3, %xmm2
803 ; SSE41-NEXT: psrld $16, %xmm1
804 ; SSE41-NEXT: psrld $16, %xmm0
805 ; SSE41-NEXT: packusdw %xmm1, %xmm0
806 ; SSE41-NEXT: movdqu %xmm2, (%rax)
807 ; SSE41-NEXT: movdqu %xmm0, (%rax)
810 ; AVX1-LABEL: trunc16i32_16i16_lshr:
811 ; AVX1: # %bb.0: # %entry
812 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
813 ; AVX1-NEXT: vpsrld $16, %xmm2, %xmm2
814 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
815 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
816 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
817 ; AVX1-NEXT: vpsrld $16, %xmm2, %xmm2
818 ; AVX1-NEXT: vpsrld $16, %xmm1, %xmm1
819 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
820 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
821 ; AVX1-NEXT: vmovups %ymm0, (%rax)
822 ; AVX1-NEXT: vzeroupper
825 ; AVX2-LABEL: trunc16i32_16i16_lshr:
826 ; AVX2: # %bb.0: # %entry
827 ; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1
828 ; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
829 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
830 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
831 ; AVX2-NEXT: vmovdqu %ymm0, (%rax)
832 ; AVX2-NEXT: vzeroupper
835 ; AVX512-LABEL: trunc16i32_16i16_lshr:
836 ; AVX512: # %bb.0: # %entry
837 ; AVX512-NEXT: vpsrld $16, %zmm0, %zmm0
838 ; AVX512-NEXT: vpmovdw %zmm0, (%rax)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
  %0 = lshr <16 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %1 = trunc <16 x i32> %0 to <16 x i16>
  store <16 x i16> %1, <16 x i16>* undef, align 4
  ret void
}

848 define void @trunc16i32_16i8(<16 x i32> %a) {
849 ; SSE2-LABEL: trunc16i32_16i8:
850 ; SSE2: # %bb.0: # %entry
851 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
852 ; SSE2-NEXT: pand %xmm4, %xmm3
853 ; SSE2-NEXT: pand %xmm4, %xmm2
854 ; SSE2-NEXT: packuswb %xmm3, %xmm2
855 ; SSE2-NEXT: pand %xmm4, %xmm1
856 ; SSE2-NEXT: pand %xmm4, %xmm0
857 ; SSE2-NEXT: packuswb %xmm1, %xmm0
858 ; SSE2-NEXT: packuswb %xmm2, %xmm0
859 ; SSE2-NEXT: movdqu %xmm0, (%rax)
862 ; SSSE3-LABEL: trunc16i32_16i8:
863 ; SSSE3: # %bb.0: # %entry
864 ; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
865 ; SSSE3-NEXT: pand %xmm4, %xmm3
866 ; SSSE3-NEXT: pand %xmm4, %xmm2
867 ; SSSE3-NEXT: packuswb %xmm3, %xmm2
868 ; SSSE3-NEXT: pand %xmm4, %xmm1
869 ; SSSE3-NEXT: pand %xmm4, %xmm0
870 ; SSSE3-NEXT: packuswb %xmm1, %xmm0
871 ; SSSE3-NEXT: packuswb %xmm2, %xmm0
872 ; SSSE3-NEXT: movdqu %xmm0, (%rax)
875 ; SSE41-LABEL: trunc16i32_16i8:
876 ; SSE41: # %bb.0: # %entry
877 ; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
878 ; SSE41-NEXT: pand %xmm4, %xmm3
879 ; SSE41-NEXT: pand %xmm4, %xmm2
880 ; SSE41-NEXT: packusdw %xmm3, %xmm2
881 ; SSE41-NEXT: pand %xmm4, %xmm1
882 ; SSE41-NEXT: pand %xmm4, %xmm0
883 ; SSE41-NEXT: packusdw %xmm1, %xmm0
884 ; SSE41-NEXT: packuswb %xmm2, %xmm0
885 ; SSE41-NEXT: movdqu %xmm0, (%rax)
888 ; AVX1-LABEL: trunc16i32_16i8:
889 ; AVX1: # %bb.0: # %entry
890 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
891 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
892 ; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2
893 ; AVX1-NEXT: vandps %xmm3, %xmm1, %xmm1
894 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
895 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
896 ; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2
897 ; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm0
898 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
899 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
900 ; AVX1-NEXT: vmovdqu %xmm0, (%rax)
901 ; AVX1-NEXT: vzeroupper
904 ; AVX2-LABEL: trunc16i32_16i8:
905 ; AVX2: # %bb.0: # %entry
906 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
907 ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
908 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
909 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
910 ; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1
911 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
912 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
913 ; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0
914 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
915 ; AVX2-NEXT: vmovdqu %xmm0, (%rax)
916 ; AVX2-NEXT: vzeroupper
919 ; AVX512-LABEL: trunc16i32_16i8:
920 ; AVX512: # %bb.0: # %entry
921 ; AVX512-NEXT: vpmovdb %zmm0, (%rax)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
  %0 = trunc <16 x i32> %a to <16 x i8>
  store <16 x i8> %0, <16 x i8>* undef, align 4
  ret void
}

930 define void @trunc16i32_16i8_ashr(<16 x i32> %a) {
931 ; SSE-LABEL: trunc16i32_16i8_ashr:
932 ; SSE: # %bb.0: # %entry
933 ; SSE-NEXT: psrad $24, %xmm1
934 ; SSE-NEXT: psrad $24, %xmm0
935 ; SSE-NEXT: packssdw %xmm1, %xmm0
936 ; SSE-NEXT: psrad $24, %xmm3
937 ; SSE-NEXT: psrad $24, %xmm2
938 ; SSE-NEXT: packssdw %xmm3, %xmm2
939 ; SSE-NEXT: packsswb %xmm2, %xmm0
940 ; SSE-NEXT: movdqu %xmm0, (%rax)
943 ; AVX1-LABEL: trunc16i32_16i8_ashr:
944 ; AVX1: # %bb.0: # %entry
945 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
946 ; AVX1-NEXT: vpsrad $24, %xmm2, %xmm2
947 ; AVX1-NEXT: vpsrad $24, %xmm0, %xmm0
948 ; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
949 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
950 ; AVX1-NEXT: vpsrad $24, %xmm2, %xmm2
951 ; AVX1-NEXT: vpsrad $24, %xmm1, %xmm1
952 ; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
953 ; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
954 ; AVX1-NEXT: vmovdqu %xmm0, (%rax)
955 ; AVX1-NEXT: vzeroupper
958 ; AVX2-LABEL: trunc16i32_16i8_ashr:
959 ; AVX2: # %bb.0: # %entry
960 ; AVX2-NEXT: vpsrad $24, %ymm1, %ymm1
961 ; AVX2-NEXT: vpsrad $24, %ymm0, %ymm0
962 ; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
963 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
964 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
965 ; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
966 ; AVX2-NEXT: vmovdqu %xmm0, (%rax)
967 ; AVX2-NEXT: vzeroupper
970 ; AVX512-LABEL: trunc16i32_16i8_ashr:
971 ; AVX512: # %bb.0: # %entry
972 ; AVX512-NEXT: vpsrld $24, %zmm0, %zmm0
973 ; AVX512-NEXT: vpmovdb %zmm0, (%rax)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
  %0 = ashr <16 x i32> %a, <i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24>
  %1 = trunc <16 x i32> %0 to <16 x i8>
  store <16 x i8> %1, <16 x i8>* undef, align 4
  ret void
}

983 define void @trunc16i32_16i8_lshr(<16 x i32> %a) {
984 ; SSE2-LABEL: trunc16i32_16i8_lshr:
985 ; SSE2: # %bb.0: # %entry
986 ; SSE2-NEXT: psrld $24, %xmm1
987 ; SSE2-NEXT: psrld $24, %xmm0
988 ; SSE2-NEXT: packuswb %xmm1, %xmm0
989 ; SSE2-NEXT: psrld $24, %xmm3
990 ; SSE2-NEXT: psrld $24, %xmm2
991 ; SSE2-NEXT: packuswb %xmm3, %xmm2
992 ; SSE2-NEXT: packuswb %xmm2, %xmm0
993 ; SSE2-NEXT: movdqu %xmm0, (%rax)
996 ; SSSE3-LABEL: trunc16i32_16i8_lshr:
997 ; SSSE3: # %bb.0: # %entry
998 ; SSSE3-NEXT: psrld $24, %xmm1
999 ; SSSE3-NEXT: psrld $24, %xmm0
1000 ; SSSE3-NEXT: packuswb %xmm1, %xmm0
1001 ; SSSE3-NEXT: psrld $24, %xmm3
1002 ; SSSE3-NEXT: psrld $24, %xmm2
1003 ; SSSE3-NEXT: packuswb %xmm3, %xmm2
1004 ; SSSE3-NEXT: packuswb %xmm2, %xmm0
1005 ; SSSE3-NEXT: movdqu %xmm0, (%rax)
1008 ; SSE41-LABEL: trunc16i32_16i8_lshr:
1009 ; SSE41: # %bb.0: # %entry
1010 ; SSE41-NEXT: psrld $24, %xmm1
1011 ; SSE41-NEXT: psrld $24, %xmm0
1012 ; SSE41-NEXT: packusdw %xmm1, %xmm0
1013 ; SSE41-NEXT: psrld $24, %xmm3
1014 ; SSE41-NEXT: psrld $24, %xmm2
1015 ; SSE41-NEXT: packusdw %xmm3, %xmm2
1016 ; SSE41-NEXT: packuswb %xmm2, %xmm0
1017 ; SSE41-NEXT: movdqu %xmm0, (%rax)
1020 ; AVX1-LABEL: trunc16i32_16i8_lshr:
1021 ; AVX1: # %bb.0: # %entry
1022 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
1023 ; AVX1-NEXT: vpsrld $24, %xmm2, %xmm2
1024 ; AVX1-NEXT: vpsrld $24, %xmm0, %xmm0
1025 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
1026 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
1027 ; AVX1-NEXT: vpsrld $24, %xmm2, %xmm2
1028 ; AVX1-NEXT: vpsrld $24, %xmm1, %xmm1
1029 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
1030 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
1031 ; AVX1-NEXT: vmovdqu %xmm0, (%rax)
1032 ; AVX1-NEXT: vzeroupper
1035 ; AVX2-LABEL: trunc16i32_16i8_lshr:
1036 ; AVX2: # %bb.0: # %entry
1037 ; AVX2-NEXT: vpsrld $24, %ymm1, %ymm1
1038 ; AVX2-NEXT: vpsrld $24, %ymm0, %ymm0
1039 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
1040 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
1041 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1042 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
1043 ; AVX2-NEXT: vmovdqu %xmm0, (%rax)
1044 ; AVX2-NEXT: vzeroupper
1047 ; AVX512-LABEL: trunc16i32_16i8_lshr:
1048 ; AVX512: # %bb.0: # %entry
1049 ; AVX512-NEXT: vpsrld $24, %zmm0, %zmm0
1050 ; AVX512-NEXT: vpmovdb %zmm0, (%rax)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
  %0 = lshr <16 x i32> %a, <i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24>
  %1 = trunc <16 x i32> %0 to <16 x i8>
  store <16 x i8> %1, <16 x i8>* undef, align 4
  ret void
}

1061 define void @trunc16i16_16i8(<16 x i16> %a) {
1062 ; SSE2-LABEL: trunc16i16_16i8:
1063 ; SSE2: # %bb.0: # %entry
1064 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
1065 ; SSE2-NEXT: pand %xmm2, %xmm1
1066 ; SSE2-NEXT: pand %xmm2, %xmm0
1067 ; SSE2-NEXT: packuswb %xmm1, %xmm0
1068 ; SSE2-NEXT: movdqu %xmm0, (%rax)
1071 ; SSSE3-LABEL: trunc16i16_16i8:
1072 ; SSSE3: # %bb.0: # %entry
1073 ; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
1074 ; SSSE3-NEXT: pshufb %xmm2, %xmm1
1075 ; SSSE3-NEXT: pshufb %xmm2, %xmm0
1076 ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1077 ; SSSE3-NEXT: movdqu %xmm0, (%rax)
1080 ; SSE41-LABEL: trunc16i16_16i8:
1081 ; SSE41: # %bb.0: # %entry
1082 ; SSE41-NEXT: movdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
1083 ; SSE41-NEXT: pshufb %xmm2, %xmm1
1084 ; SSE41-NEXT: pshufb %xmm2, %xmm0
1085 ; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1086 ; SSE41-NEXT: movdqu %xmm0, (%rax)
1089 ; AVX1-LABEL: trunc16i16_16i8:
1090 ; AVX1: # %bb.0: # %entry
1091 ; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
1092 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1093 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
1094 ; AVX1-NEXT: vmovdqu %xmm0, (%rax)
1095 ; AVX1-NEXT: vzeroupper
1098 ; AVX2-LABEL: trunc16i16_16i8:
1099 ; AVX2: # %bb.0: # %entry
1100 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
1101 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1102 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
1103 ; AVX2-NEXT: vmovdqu %xmm0, (%rax)
1104 ; AVX2-NEXT: vzeroupper
1107 ; AVX512F-LABEL: trunc16i16_16i8:
1108 ; AVX512F: # %bb.0: # %entry
1109 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1110 ; AVX512F-NEXT: vpmovdb %zmm0, (%rax)
1111 ; AVX512F-NEXT: vzeroupper
1112 ; AVX512F-NEXT: retq
1114 ; AVX512VL-LABEL: trunc16i16_16i8:
1115 ; AVX512VL: # %bb.0: # %entry
1116 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1117 ; AVX512VL-NEXT: vpmovdb %zmm0, (%rax)
1118 ; AVX512VL-NEXT: vzeroupper
1119 ; AVX512VL-NEXT: retq
1121 ; AVX512BW-LABEL: trunc16i16_16i8:
1122 ; AVX512BW: # %bb.0: # %entry
1123 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1124 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
1125 ; AVX512BW-NEXT: vmovdqu %xmm0, (%rax)
1126 ; AVX512BW-NEXT: vzeroupper
1127 ; AVX512BW-NEXT: retq
1129 ; AVX512BWVL-LABEL: trunc16i16_16i8:
1130 ; AVX512BWVL: # %bb.0: # %entry
1131 ; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rax)
1132 ; AVX512BWVL-NEXT: vzeroupper
1133 ; AVX512BWVL-NEXT: retq
entry:
  %0 = trunc <16 x i16> %a to <16 x i8>
  store <16 x i8> %0, <16 x i8>* undef, align 4
  ret void
}

1140 define void @trunc16i16_16i8_ashr(<16 x i16> %a) {
1141 ; SSE-LABEL: trunc16i16_16i8_ashr:
1142 ; SSE: # %bb.0: # %entry
1143 ; SSE-NEXT: psraw $8, %xmm1
1144 ; SSE-NEXT: psraw $8, %xmm0
1145 ; SSE-NEXT: packsswb %xmm1, %xmm0
1146 ; SSE-NEXT: movdqu %xmm0, (%rax)
1149 ; AVX1-LABEL: trunc16i16_16i8_ashr:
1150 ; AVX1: # %bb.0: # %entry
1151 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1152 ; AVX1-NEXT: vpsraw $8, %xmm1, %xmm1
1153 ; AVX1-NEXT: vpsraw $8, %xmm0, %xmm0
1154 ; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
1155 ; AVX1-NEXT: vmovdqu %xmm0, (%rax)
1156 ; AVX1-NEXT: vzeroupper
1159 ; AVX2-LABEL: trunc16i16_16i8_ashr:
1160 ; AVX2: # %bb.0: # %entry
1161 ; AVX2-NEXT: vpsraw $8, %ymm0, %ymm0
1162 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1163 ; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
1164 ; AVX2-NEXT: vmovdqu %xmm0, (%rax)
1165 ; AVX2-NEXT: vzeroupper
1168 ; AVX512F-LABEL: trunc16i16_16i8_ashr:
1169 ; AVX512F: # %bb.0: # %entry
1170 ; AVX512F-NEXT: vpsraw $8, %ymm0, %ymm0
1171 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1172 ; AVX512F-NEXT: vpmovdb %zmm0, (%rax)
1173 ; AVX512F-NEXT: vzeroupper
1174 ; AVX512F-NEXT: retq
1176 ; AVX512VL-LABEL: trunc16i16_16i8_ashr:
1177 ; AVX512VL: # %bb.0: # %entry
1178 ; AVX512VL-NEXT: vpsraw $8, %ymm0, %ymm0
1179 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1180 ; AVX512VL-NEXT: vpmovdb %zmm0, (%rax)
1181 ; AVX512VL-NEXT: vzeroupper
1182 ; AVX512VL-NEXT: retq
1184 ; AVX512BW-LABEL: trunc16i16_16i8_ashr:
1185 ; AVX512BW: # %bb.0: # %entry
1186 ; AVX512BW-NEXT: vpsraw $8, %ymm0, %ymm0
1187 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
1188 ; AVX512BW-NEXT: vmovdqu %xmm0, (%rax)
1189 ; AVX512BW-NEXT: vzeroupper
1190 ; AVX512BW-NEXT: retq
1192 ; AVX512BWVL-LABEL: trunc16i16_16i8_ashr:
1193 ; AVX512BWVL: # %bb.0: # %entry
1194 ; AVX512BWVL-NEXT: vpsrlw $8, %ymm0, %ymm0
1195 ; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rax)
1196 ; AVX512BWVL-NEXT: vzeroupper
1197 ; AVX512BWVL-NEXT: retq
entry:
  %0 = ashr <16 x i16> %a, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %1 = trunc <16 x i16> %0 to <16 x i8>
  store <16 x i8> %1, <16 x i8>* undef, align 4
  ret void
}

1205 define void @trunc16i16_16i8_lshr(<16 x i16> %a) {
1206 ; SSE-LABEL: trunc16i16_16i8_lshr:
1207 ; SSE: # %bb.0: # %entry
1208 ; SSE-NEXT: psrlw $8, %xmm1
1209 ; SSE-NEXT: psrlw $8, %xmm0
1210 ; SSE-NEXT: packuswb %xmm1, %xmm0
1211 ; SSE-NEXT: movdqu %xmm0, (%rax)
1214 ; AVX1-LABEL: trunc16i16_16i8_lshr:
1215 ; AVX1: # %bb.0: # %entry
1216 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1217 ; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
1218 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
1219 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
1220 ; AVX1-NEXT: vmovdqu %xmm0, (%rax)
1221 ; AVX1-NEXT: vzeroupper
1224 ; AVX2-LABEL: trunc16i16_16i8_lshr:
1225 ; AVX2: # %bb.0: # %entry
1226 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
1227 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1228 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
1229 ; AVX2-NEXT: vmovdqu %xmm0, (%rax)
1230 ; AVX2-NEXT: vzeroupper
1233 ; AVX512F-LABEL: trunc16i16_16i8_lshr:
1234 ; AVX512F: # %bb.0: # %entry
1235 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
1236 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1237 ; AVX512F-NEXT: vpmovdb %zmm0, (%rax)
1238 ; AVX512F-NEXT: vzeroupper
1239 ; AVX512F-NEXT: retq
1241 ; AVX512VL-LABEL: trunc16i16_16i8_lshr:
1242 ; AVX512VL: # %bb.0: # %entry
1243 ; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
1244 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1245 ; AVX512VL-NEXT: vpmovdb %zmm0, (%rax)
1246 ; AVX512VL-NEXT: vzeroupper
1247 ; AVX512VL-NEXT: retq
1249 ; AVX512BW-LABEL: trunc16i16_16i8_lshr:
1250 ; AVX512BW: # %bb.0: # %entry
1251 ; AVX512BW-NEXT: vpsrlw $8, %ymm0, %ymm0
1252 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
1253 ; AVX512BW-NEXT: vmovdqu %xmm0, (%rax)
1254 ; AVX512BW-NEXT: vzeroupper
1255 ; AVX512BW-NEXT: retq
1257 ; AVX512BWVL-LABEL: trunc16i16_16i8_lshr:
1258 ; AVX512BWVL: # %bb.0: # %entry
1259 ; AVX512BWVL-NEXT: vpsrlw $8, %ymm0, %ymm0
1260 ; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rax)
1261 ; AVX512BWVL-NEXT: vzeroupper
1262 ; AVX512BWVL-NEXT: retq
entry:
  %0 = lshr <16 x i16> %a, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %1 = trunc <16 x i16> %0 to <16 x i8>
  store <16 x i8> %1, <16 x i8>* undef, align 4
  ret void
}

1270 define void @trunc32i16_32i8(<32 x i16> %a) {
1271 ; SSE2-LABEL: trunc32i16_32i8:
1272 ; SSE2: # %bb.0: # %entry
1273 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
1274 ; SSE2-NEXT: pand %xmm4, %xmm1
1275 ; SSE2-NEXT: pand %xmm4, %xmm0
1276 ; SSE2-NEXT: packuswb %xmm1, %xmm0
1277 ; SSE2-NEXT: pand %xmm4, %xmm3
1278 ; SSE2-NEXT: pand %xmm4, %xmm2
1279 ; SSE2-NEXT: packuswb %xmm3, %xmm2
1280 ; SSE2-NEXT: movdqu %xmm2, (%rax)
1281 ; SSE2-NEXT: movdqu %xmm0, (%rax)
1284 ; SSSE3-LABEL: trunc32i16_32i8:
1285 ; SSSE3: # %bb.0: # %entry
1286 ; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
1287 ; SSSE3-NEXT: pshufb %xmm4, %xmm1
1288 ; SSSE3-NEXT: pshufb %xmm4, %xmm0
1289 ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1290 ; SSSE3-NEXT: pshufb %xmm4, %xmm3
1291 ; SSSE3-NEXT: pshufb %xmm4, %xmm2
1292 ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
1293 ; SSSE3-NEXT: movdqu %xmm2, (%rax)
1294 ; SSSE3-NEXT: movdqu %xmm0, (%rax)
1297 ; SSE41-LABEL: trunc32i16_32i8:
1298 ; SSE41: # %bb.0: # %entry
1299 ; SSE41-NEXT: movdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
1300 ; SSE41-NEXT: pshufb %xmm4, %xmm1
1301 ; SSE41-NEXT: pshufb %xmm4, %xmm0
1302 ; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1303 ; SSE41-NEXT: pshufb %xmm4, %xmm3
1304 ; SSE41-NEXT: pshufb %xmm4, %xmm2
1305 ; SSE41-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
1306 ; SSE41-NEXT: movdqu %xmm2, (%rax)
1307 ; SSE41-NEXT: movdqu %xmm0, (%rax)
1310 ; AVX1-LABEL: trunc32i16_32i8:
1311 ; AVX1: # %bb.0: # %entry
1312 ; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
1313 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
1314 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
1315 ; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
1316 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
1317 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
1318 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
1319 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1320 ; AVX1-NEXT: vmovups %ymm0, (%rax)
1321 ; AVX1-NEXT: vzeroupper
1324 ; AVX2-LABEL: trunc32i16_32i8:
1325 ; AVX2: # %bb.0: # %entry
1326 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
1327 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
1328 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
1329 ; AVX2-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
1330 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
1331 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
1332 ; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
1333 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
1334 ; AVX2-NEXT: vmovdqu %ymm0, (%rax)
1335 ; AVX2-NEXT: vzeroupper
1338 ; AVX512F-LABEL: trunc32i16_32i8:
1339 ; AVX512F: # %bb.0: # %entry
1340 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1341 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
1342 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
1343 ; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
1344 ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
1345 ; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
1346 ; AVX512F-NEXT: vzeroupper
1347 ; AVX512F-NEXT: retq
1349 ; AVX512VL-LABEL: trunc32i16_32i8:
1350 ; AVX512VL: # %bb.0: # %entry
1351 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1352 ; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
1353 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
1354 ; AVX512VL-NEXT: vpmovdb %zmm1, %xmm1
1355 ; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
1356 ; AVX512VL-NEXT: vmovdqu %ymm0, (%rax)
1357 ; AVX512VL-NEXT: vzeroupper
1358 ; AVX512VL-NEXT: retq
1360 ; AVX512BW-LABEL: trunc32i16_32i8:
1361 ; AVX512BW: # %bb.0: # %entry
1362 ; AVX512BW-NEXT: vpmovwb %zmm0, (%rax)
1363 ; AVX512BW-NEXT: vzeroupper
1364 ; AVX512BW-NEXT: retq
1366 ; AVX512BWVL-LABEL: trunc32i16_32i8:
1367 ; AVX512BWVL: # %bb.0: # %entry
1368 ; AVX512BWVL-NEXT: vpmovwb %zmm0, (%rax)
1369 ; AVX512BWVL-NEXT: vzeroupper
1370 ; AVX512BWVL-NEXT: retq
entry:
  %0 = trunc <32 x i16> %a to <32 x i8>
  store <32 x i8> %0, <32 x i8>* undef, align 4
  ret void
}

1377 define <8 x i32> @trunc2x4i64_8i32(<4 x i64> %a, <4 x i64> %b) {
1378 ; SSE-LABEL: trunc2x4i64_8i32:
1379 ; SSE: # %bb.0: # %entry
1380 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
1381 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
1382 ; SSE-NEXT: movaps %xmm2, %xmm1
1385 ; AVX1-LABEL: trunc2x4i64_8i32:
1386 ; AVX1: # %bb.0: # %entry
1387 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
1388 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
1389 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
1390 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
1391 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1394 ; AVX2-SLOW-LABEL: trunc2x4i64_8i32:
1395 ; AVX2-SLOW: # %bb.0: # %entry
1396 ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
1397 ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
1398 ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
1399 ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
1400 ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1401 ; AVX2-SLOW-NEXT: retq
1403 ; AVX2-FAST-LABEL: trunc2x4i64_8i32:
1404 ; AVX2-FAST: # %bb.0: # %entry
1405 ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
1406 ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0
1407 ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1
1408 ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1409 ; AVX2-FAST-NEXT: retq
1411 ; AVX512F-LABEL: trunc2x4i64_8i32:
1412 ; AVX512F: # %bb.0: # %entry
1413 ; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
1414 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1415 ; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
1416 ; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
1417 ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
1418 ; AVX512F-NEXT: retq
1420 ; AVX512VL-LABEL: trunc2x4i64_8i32:
1421 ; AVX512VL: # %bb.0: # %entry
1422 ; AVX512VL-NEXT: vpmovqd %ymm0, %xmm0
1423 ; AVX512VL-NEXT: vpmovqd %ymm1, %xmm1
1424 ; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
1425 ; AVX512VL-NEXT: retq
1427 ; AVX512BW-LABEL: trunc2x4i64_8i32:
1428 ; AVX512BW: # %bb.0: # %entry
1429 ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
1430 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1431 ; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
1432 ; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
1433 ; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
1434 ; AVX512BW-NEXT: retq
1436 ; AVX512BWVL-LABEL: trunc2x4i64_8i32:
1437 ; AVX512BWVL: # %bb.0: # %entry
1438 ; AVX512BWVL-NEXT: vpmovqd %ymm0, %xmm0
1439 ; AVX512BWVL-NEXT: vpmovqd %ymm1, %xmm1
1440 ; AVX512BWVL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
1441 ; AVX512BWVL-NEXT: retq
entry:
  %0 = trunc <4 x i64> %a to <4 x i32>
  %1 = trunc <4 x i64> %b to <4 x i32>
  %2 = shufflevector <4 x i32> %0, <4 x i32> %1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i32> %2
}

define <8 x i16> @trunc2x4i64_8i16(<4 x i64> %a, <4 x i64> %b) {
; SSE2-LABEL: trunc2x4i64_8i16:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc2x4i64_8i16:
; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSSE3-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc2x4i64_8i16:
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE41-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7]
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE41-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
; SSE41-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE41-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc2x4i64_8i16:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc2x4i64_8i16:
; AVX2-SLOW: # %bb.0: # %entry
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: trunc2x4i64_8i16:
; AVX2-FAST: # %bb.0: # %entry
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: trunc2x4i64_8i16:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc2x4i64_8i16:
; AVX512VL: # %bb.0: # %entry
; AVX512VL-NEXT: vpmovqd %ymm0, %xmm0
; AVX512VL-NEXT: vpmovqd %ymm1, %xmm1
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc2x4i64_8i16:
; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc2x4i64_8i16:
; AVX512BWVL: # %bb.0: # %entry
; AVX512BWVL-NEXT: vpmovqd %ymm0, %xmm0
; AVX512BWVL-NEXT: vpmovqd %ymm1, %xmm1
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
entry:
  %0 = trunc <4 x i64> %a to <4 x i16>
  %1 = trunc <4 x i64> %b to <4 x i16>
  %2 = shufflevector <4 x i16> %0, <4 x i16> %1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i16> %2
}

define <4 x i32> @trunc2x2i64_4i32(<2 x i64> %a, <2 x i64> %b) {
; SSE-LABEL: trunc2x2i64_4i32:
; SSE: # %bb.0: # %entry
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT: retq
;
; AVX-LABEL: trunc2x2i64_4i32:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX-NEXT: retq
;
; AVX512-LABEL: trunc2x2i64_4i32:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX512-NEXT: retq
entry:
  %0 = trunc <2 x i64> %a to <2 x i32>
  %1 = trunc <2 x i64> %b to <2 x i32>
  %2 = shufflevector <2 x i32> %0, <2 x i32> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i32> %2
}

define i64 @trunc2i64_i64(<2 x i64> %inval) {
; SSE-LABEL: trunc2i64_i64:
; SSE: # %bb.0: # %entry
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: retq
;
; AVX-LABEL: trunc2i64_i64:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX-NEXT: vmovq %xmm0, %rax
; AVX-NEXT: retq
;
; AVX512-LABEL: trunc2i64_i64:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512-NEXT: vmovq %xmm0, %rax
; AVX512-NEXT: retq
entry:
  %0 = trunc <2 x i64> %inval to <2 x i32>
  %1 = bitcast <2 x i32> %0 to i64
  ret i64 %1
}

define <8 x i16> @trunc2x4i32_8i16(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: trunc2x4i32_8i16:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc2x4i32_8i16:
; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; SSSE3-NEXT: pshufb %xmm2, %xmm1
; SSSE3-NEXT: pshufb %xmm2, %xmm0
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc2x4i32_8i16:
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; SSE41-NEXT: pshufb %xmm2, %xmm1
; SSE41-NEXT: pshufb %xmm2, %xmm0
; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE41-NEXT: retq
;
; AVX-LABEL: trunc2x4i32_8i16:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT: retq
;
; AVX512-LABEL: trunc2x4i32_8i16:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512-NEXT: retq
entry:
  %0 = trunc <4 x i32> %a to <4 x i16>
  %1 = trunc <4 x i32> %b to <4 x i16>
  %2 = shufflevector <4 x i16> %0, <4 x i16> %1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i16> %2
}

; PR15524 http://llvm.org/bugs/show_bug.cgi?id=15524
define i64 @trunc4i32_i64(<4 x i32> %inval) {
; SSE2-LABEL: trunc4i32_i64:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc4i32_i64:
; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; SSSE3-NEXT: movq %xmm0, %rax
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc4i32_i64:
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; SSE41-NEXT: movq %xmm0, %rax
; SSE41-NEXT: retq
;
; AVX-LABEL: trunc4i32_i64:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX-NEXT: vmovq %xmm0, %rax
; AVX-NEXT: retq
;
; AVX512-LABEL: trunc4i32_i64:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512-NEXT: vmovq %xmm0, %rax
; AVX512-NEXT: retq
entry:
  %0 = trunc <4 x i32> %inval to <4 x i16>
  %1 = bitcast <4 x i16> %0 to i64
  ret i64 %1
}

define <16 x i8> @trunc2x8i16_16i8(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: trunc2x8i16_16i8:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: packuswb %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc2x8i16_16i8:
; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; SSSE3-NEXT: pshufb %xmm2, %xmm1
; SSSE3-NEXT: pshufb %xmm2, %xmm0
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc2x8i16_16i8:
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; SSE41-NEXT: pshufb %xmm2, %xmm1
; SSE41-NEXT: pshufb %xmm2, %xmm0
; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE41-NEXT: retq
;
; AVX-LABEL: trunc2x8i16_16i8:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT: retq
;
; AVX512-LABEL: trunc2x8i16_16i8:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512-NEXT: retq
entry:
  %0 = trunc <8 x i16> %a to <8 x i8>
  %1 = trunc <8 x i16> %b to <8 x i8>
  %2 = shufflevector <8 x i8> %0, <8 x i8> %1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i8> %2
}

; PR15524 http://llvm.org/bugs/show_bug.cgi?id=15524
define i64 @trunc8i16_i64(<8 x i16> %inval) {
; SSE2-LABEL: trunc8i16_i64:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc8i16_i64:
; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; SSSE3-NEXT: movq %xmm0, %rax
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc8i16_i64:
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; SSE41-NEXT: movq %xmm0, %rax
; SSE41-NEXT: retq
;
; AVX-LABEL: trunc8i16_i64:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX-NEXT: vmovq %xmm0, %rax
; AVX-NEXT: retq
;
; AVX512-LABEL: trunc8i16_i64:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vmovq %xmm0, %rax
; AVX512-NEXT: retq
entry:
  %0 = trunc <8 x i16> %inval to <8 x i8>
  %1 = bitcast <8 x i8> %0 to i64
  ret i64 %1
}

define <16 x i8> @trunc16i64_16i8_const() {
; SSE-LABEL: trunc16i64_16i8_const:
; SSE: # %bb.0: # %entry
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: trunc16i64_16i8_const:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: trunc16i64_16i8_const:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT: retq
entry:

  %0 = trunc <16 x i64> zeroinitializer to <16 x i8>
  %1 = shufflevector <16 x i8> %0, <16 x i8> %0, <16 x i32> <i32 28, i32 30, i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 undef, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26>
  ret <16 x i8> %1
}

define <8 x i16> @PR32160(<8 x i32> %x) {
; SSE-LABEL: PR32160:
; SSE: # %bb.0:
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
; SSE-NEXT: retq
;
; AVX1-LABEL: PR32160:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,8,9,8,9,8,9,8,9]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: PR32160:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; AVX2-SLOW-NEXT: vpbroadcastd %xmm0, %xmm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: PR32160:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,8,9,8,9,8,9,8,9]
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: PR32160:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,2,3,4,5,6,7]
; AVX512F-NEXT: vpbroadcastd %xmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: PR32160:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5]
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: PR32160:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5]
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: PR32160:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0
; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5]
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %shuf = trunc <8 x i32> %x to <8 x i16>
  %trunc = shufflevector <8 x i16> %shuf, <8 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  ret <8 x i16> %trunc
}

define void @PR34773(i16* %a0, i8* %a1) {
; SSE-LABEL: PR34773:
; SSE: # %bb.0:
; SSE-NEXT: movdqu (%rdi), %xmm0
; SSE-NEXT: movdqu 16(%rdi), %xmm1
; SSE-NEXT: movdqu 32(%rdi), %xmm2
; SSE-NEXT: movdqu 48(%rdi), %xmm3
; SSE-NEXT: psrlw $8, %xmm1
; SSE-NEXT: psrlw $8, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: psrlw $8, %xmm3
; SSE-NEXT: psrlw $8, %xmm2
; SSE-NEXT: packuswb %xmm3, %xmm2
; SSE-NEXT: movdqu %xmm0, (%rsi)
; SSE-NEXT: movdqu %xmm2, 16(%rsi)
; SSE-NEXT: retq
;
; AVX1-LABEL: PR34773:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqu (%rdi), %xmm0
; AVX1-NEXT: vmovdqu 16(%rdi), %xmm1
; AVX1-NEXT: vmovdqu 32(%rdi), %xmm2
; AVX1-NEXT: vmovdqu 48(%rdi), %xmm3
; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm1
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vmovdqu %xmm0, (%rsi)
; AVX1-NEXT: vmovdqu %xmm1, 16(%rsi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: PR34773:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqu (%rdi), %ymm0
; AVX2-NEXT: vmovdqu 32(%rdi), %ymm1
; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vmovdqu %xmm0, (%rsi)
; AVX2-NEXT: vmovdqu %xmm1, 16(%rsi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: PR34773:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqu (%rdi), %ymm0
; AVX512F-NEXT: vmovdqu 32(%rdi), %ymm1
; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT: vpmovdb %zmm0, (%rsi)
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512F-NEXT: vpmovdb %zmm0, 16(%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: PR34773:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqu (%rdi), %ymm0
; AVX512VL-NEXT: vmovdqu 32(%rdi), %ymm1
; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VL-NEXT: vpmovdb %zmm0, (%rsi)
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512VL-NEXT: vpmovdb %zmm0, 16(%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: PR34773:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqu (%rdi), %ymm0
; AVX512BW-NEXT: vmovdqu 32(%rdi), %ymm1
; AVX512BW-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512BW-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT: vmovdqu %xmm0, (%rsi)
; AVX512BW-NEXT: vmovdqu %xmm1, 16(%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: PR34773:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpsrlw $8, (%rdi), %ymm0
; AVX512BWVL-NEXT: vpsrlw $8, 32(%rdi), %ymm1
; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rsi)
; AVX512BWVL-NEXT: vpmovwb %ymm1, 16(%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %1 = getelementptr i16, i16* %a0, i64 16
  %2 = getelementptr i8, i8* %a1, i64 16
  %3 = bitcast i16* %a0 to <16 x i16>*
  %4 = bitcast i16* %1 to <16 x i16>*
  %5 = bitcast i8* %a1 to <16 x i8>*
  %6 = bitcast i8* %2 to <16 x i8>*
  %7 = load <16 x i16>, <16 x i16>* %3, align 2
  %8 = load <16 x i16>, <16 x i16>* %4, align 2
  %9 = lshr <16 x i16> %7, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %10 = lshr <16 x i16> %8, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %11 = trunc <16 x i16> %9 to <16 x i8>
  %12 = trunc <16 x i16> %10 to <16 x i8>
  store <16 x i8> %11, <16 x i8>* %5, align 1
  store <16 x i8> %12, <16 x i8>* %6, align 1