; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VBMIVL

; Pairs of shufflevector:trunc functions with functional equivalence.
; Ideally, the shuffles should be lowered to code with the same quality as the truncates.
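;
; For example (an illustrative sketch, not one of the checked functions below),
; taking the even bytes of a <32 x i8> value is equivalent, on little-endian
; x86, to truncating its <16 x i16> bitcast:
;   %even  = shufflevector <32 x i8> %v, <32 x i8> undef,
;            <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14,
;                        i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
;   %wide  = bitcast <32 x i8> %v to <16 x i16>
;   %trunc = trunc <16 x i16> %wide to <16 x i8>   ; same bytes as %even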

define void @shuffle_v32i8_to_v16i8(<32 x i8>* %L, <16 x i8>* %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v16i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v16i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX512F-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512F-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v16i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v16i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BW-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v16i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
;
; AVX512VBMIVL-LABEL: shuffle_v32i8_to_v16i8:
; AVX512VBMIVL:       # %bb.0:
; AVX512VBMIVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VBMIVL-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
; AVX512VBMIVL-NEXT:    vpermi2b 16(%rdi), %xmm0, %xmm1
; AVX512VBMIVL-NEXT:    vmovdqa %xmm1, (%rsi)
; AVX512VBMIVL-NEXT:    retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
  store <16 x i8> %strided.vec, <16 x i8>* %S
  ret void
}

define void @trunc_v16i16_to_v16i8(<32 x i8>* %L, <16 x i8>* %S) nounwind {
; AVX1-LABEL: trunc_v16i16_to_v16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovaps (%rdi), %ymm0
; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_v16i16_to_v16i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: trunc_v16i16_to_v16i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512F-NEXT:    vpmovdb %zmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc_v16i16_to_v16i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512VL-NEXT:    vpmovdb %zmm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc_v16i16_to_v16i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc_v16i16_to_v16i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT:    vpmovwb %ymm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; AVX512VBMIVL-LABEL: trunc_v16i16_to_v16i8:
; AVX512VBMIVL:       # %bb.0:
; AVX512VBMIVL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VBMIVL-NEXT:    vpmovwb %ymm0, (%rsi)
; AVX512VBMIVL-NEXT:    vzeroupper
; AVX512VBMIVL-NEXT:    retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %bc = bitcast <32 x i8> %vec to <16 x i16>
  %strided.vec = trunc <16 x i16> %bc to <16 x i8>
  store <16 x i8> %strided.vec, <16 x i8>* %S
  ret void
}

define void @shuffle_v16i16_to_v8i16(<16 x i16>* %L, <8 x i16>* %S) nounwind {
; AVX-LABEL: shuffle_v16i16_to_v8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v16i16_to_v8i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512F-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512F-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v16i16_to_v8i16:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v8i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512BW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BW-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v8i16:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,2,4,6,8,10,12,14]
; AVX512BWVL-NEXT:    vpermi2w 16(%rdi), %xmm0, %xmm1
; AVX512BWVL-NEXT:    vmovdqa %xmm1, (%rsi)
; AVX512BWVL-NEXT:    retq
;
; AVX512VBMIVL-LABEL: shuffle_v16i16_to_v8i16:
; AVX512VBMIVL:       # %bb.0:
; AVX512VBMIVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VBMIVL-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,2,4,6,8,10,12,14]
; AVX512VBMIVL-NEXT:    vpermi2w 16(%rdi), %xmm0, %xmm1
; AVX512VBMIVL-NEXT:    vmovdqa %xmm1, (%rsi)
; AVX512VBMIVL-NEXT:    retq
  %vec = load <16 x i16>, <16 x i16>* %L
  %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  store <8 x i16> %strided.vec, <8 x i16>* %S
  ret void
}

define void @trunc_v8i32_to_v8i16(<16 x i16>* %L, <8 x i16>* %S) nounwind {
; AVX1-LABEL: trunc_v8i32_to_v8i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_v8i32_to_v8i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: trunc_v8i32_to_v8i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512F-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc_v8i32_to_v8i16:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT:    vpmovdw %ymm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc_v8i32_to_v8i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512BW-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc_v8i32_to_v8i16:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT:    vpmovdw %ymm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i16:
; AVX512VBMIVL:       # %bb.0:
; AVX512VBMIVL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VBMIVL-NEXT:    vpmovdw %ymm0, (%rsi)
; AVX512VBMIVL-NEXT:    vzeroupper
; AVX512VBMIVL-NEXT:    retq
  %vec = load <16 x i16>, <16 x i16>* %L
  %bc = bitcast <16 x i16> %vec to <8 x i32>
  %strided.vec = trunc <8 x i32> %bc to <8 x i16>
  store <8 x i16> %strided.vec, <8 x i16>* %S
  ret void
}

define void @shuffle_v8i32_to_v4i32(<8 x i32>* %L, <4 x i32>* %S) nounwind {
; AVX-LABEL: shuffle_v8i32_to_v4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps (%rdi), %xmm0
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
; AVX-NEXT:    vmovaps %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v8i32_to_v4i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovaps (%rdi), %xmm0
; AVX512-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
; AVX512-NEXT:    vmovaps %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <8 x i32>, <8 x i32>* %L
  %strided.vec = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  store <4 x i32> %strided.vec, <4 x i32>* %S
  ret void
}

define void @trunc_v4i64_to_v4i32(<8 x i32>* %L, <4 x i32>* %S) nounwind {
; AVX1-LABEL: trunc_v4i64_to_v4i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovaps (%rdi), %xmm0
; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
; AVX1-NEXT:    vmovaps %xmm0, (%rsi)
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i32:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vmovaps (%rdi), %xmm0
; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
; AVX2-SLOW-NEXT:    vmovaps %xmm0, (%rsi)
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: trunc_v4i64_to_v4i32:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vmovaps {{.*#+}} ymm0 = [0,2,4,6,4,6,6,7]
; AVX2-FAST-NEXT:    vpermps (%rdi), %ymm0, %ymm0
; AVX2-FAST-NEXT:    vmovaps %xmm0, (%rsi)
; AVX2-FAST-NEXT:    vzeroupper
; AVX2-FAST-NEXT:    retq
;
; AVX512F-LABEL: trunc_v4i64_to_v4i32:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpmovqd %zmm0, %ymm0
; AVX512F-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc_v4i64_to_v4i32:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT:    vpmovqd %ymm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc_v4i64_to_v4i32:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vpmovqd %zmm0, %ymm0
; AVX512BW-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc_v4i64_to_v4i32:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT:    vpmovqd %ymm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i32:
; AVX512VBMIVL:       # %bb.0:
; AVX512VBMIVL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VBMIVL-NEXT:    vpmovqd %ymm0, (%rsi)
; AVX512VBMIVL-NEXT:    vzeroupper
; AVX512VBMIVL-NEXT:    retq
  %vec = load <8 x i32>, <8 x i32>* %L
  %bc = bitcast <8 x i32> %vec to <4 x i64>
  %strided.vec = trunc <4 x i64> %bc to <4 x i32>
  store <4 x i32> %strided.vec, <4 x i32>* %S
  ret void
}

define void @shuffle_v32i8_to_v8i8(<32 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v8i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-NEXT:    vmovq %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v8i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v8i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v8i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,2,4,6,8,10,12,14]
; AVX512BWVL-NEXT:    vpermi2w 16(%rdi), %xmm0, %xmm1
; AVX512BWVL-NEXT:    vpmovwb %xmm1, (%rsi)
; AVX512BWVL-NEXT:    retq
;
; AVX512VBMIVL-LABEL: shuffle_v32i8_to_v8i8:
; AVX512VBMIVL:       # %bb.0:
; AVX512VBMIVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VBMIVL-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,2,4,6,8,10,12,14]
; AVX512VBMIVL-NEXT:    vpermi2w 16(%rdi), %xmm0, %xmm1
; AVX512VBMIVL-NEXT:    vpmovwb %xmm1, (%rsi)
; AVX512VBMIVL-NEXT:    retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}

define void @trunc_v8i32_to_v8i8(<32 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX1-LABEL: trunc_v8i32_to_v8i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT:    vmovq %xmm0, (%rsi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_v8i32_to_v8i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX2-NEXT:    vmovq %xmm0, (%rsi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: trunc_v8i32_to_v8i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc_v8i32_to_v8i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT:    vpmovdb %ymm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc_v8i32_to_v8i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT:    vpmovdb %ymm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i8:
; AVX512VBMIVL:       # %bb.0:
; AVX512VBMIVL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VBMIVL-NEXT:    vpmovdb %ymm0, (%rsi)
; AVX512VBMIVL-NEXT:    vzeroupper
; AVX512VBMIVL-NEXT:    retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %bc = bitcast <32 x i8> %vec to <8 x i32>
  %strided.vec = trunc <8 x i32> %bc to <8 x i8>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}

496 define <2 x i64> @trunc_v8i32_to_v8i8_return_v2i64(<8 x i32> %vec) nounwind {
498 ; return (__m128i) {(long long)__builtin_convertvector((__v8si)__A, __v8qi), 0};
499 ; AVX1-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
501 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
502 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
503 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
504 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
505 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
506 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
507 ; AVX1-NEXT: vzeroupper
510 ; AVX2-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
512 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
513 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
514 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
515 ; AVX2-NEXT: vzeroupper
518 ; AVX512F-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
520 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
521 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
522 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
523 ; AVX512F-NEXT: vzeroupper
526 ; AVX512VL-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
528 ; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0
529 ; AVX512VL-NEXT: vzeroupper
530 ; AVX512VL-NEXT: retq
532 ; AVX512BW-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
534 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
535 ; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
536 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
537 ; AVX512BW-NEXT: vzeroupper
538 ; AVX512BW-NEXT: retq
540 ; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
541 ; AVX512BWVL: # %bb.0:
542 ; AVX512BWVL-NEXT: vpmovdb %ymm0, %xmm0
543 ; AVX512BWVL-NEXT: vzeroupper
544 ; AVX512BWVL-NEXT: retq
546 ; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
547 ; AVX512VBMIVL: # %bb.0:
548 ; AVX512VBMIVL-NEXT: vpmovdb %ymm0, %xmm0
549 ; AVX512VBMIVL-NEXT: vzeroupper
550 ; AVX512VBMIVL-NEXT: retq
551 %truncated.vec = trunc <8 x i32> %vec to <8 x i8>
552 %bc = bitcast <8 x i8> %truncated.vec to i64
553 %result = insertelement <2 x i64> zeroinitializer, i64 %bc, i32 0
554 ret <2 x i64> %result
557 define <16 x i8> @trunc_v8i32_to_v8i8_with_zext_return_v16i8(<8 x i32> %vec) nounwind {
558 ; AVX1-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8:
560 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
561 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
562 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
563 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
564 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
565 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
566 ; AVX1-NEXT: vzeroupper
569 ; AVX2-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8:
571 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
572 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
573 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
574 ; AVX2-NEXT: vzeroupper
577 ; AVX512F-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8:
579 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
580 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
581 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
582 ; AVX512F-NEXT: vzeroupper
585 ; AVX512VL-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8:
587 ; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0
588 ; AVX512VL-NEXT: vzeroupper
589 ; AVX512VL-NEXT: retq
591 ; AVX512BW-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8:
593 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
594 ; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
595 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
596 ; AVX512BW-NEXT: vzeroupper
597 ; AVX512BW-NEXT: retq
599 ; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8:
600 ; AVX512BWVL: # %bb.0:
601 ; AVX512BWVL-NEXT: vpmovdb %ymm0, %xmm0
602 ; AVX512BWVL-NEXT: vzeroupper
603 ; AVX512BWVL-NEXT: retq
605 ; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8:
606 ; AVX512VBMIVL: # %bb.0:
607 ; AVX512VBMIVL-NEXT: vpmovdb %ymm0, %xmm0
608 ; AVX512VBMIVL-NEXT: vzeroupper
609 ; AVX512VBMIVL-NEXT: retq
610 %truncated = trunc <8 x i32> %vec to <8 x i8>
611 %truncated.ext = zext <8 x i8> %truncated to <8 x i16>
612 %bc = bitcast <8 x i16> %truncated.ext to <16 x i8>
613 %result = shufflevector <16 x i8> %bc, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
614 ret <16 x i8> %result
617 define <16 x i8> @trunc_v8i32_to_v8i8_via_v8i16_return_v16i8(<8 x i32> %vec) nounwind {
618 ; AVX1-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8:
620 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
621 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
622 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
623 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
624 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
625 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
626 ; AVX1-NEXT: vzeroupper
629 ; AVX2-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8:
631 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
632 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
633 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
634 ; AVX2-NEXT: vzeroupper
637 ; AVX512F-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8:
639 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
640 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
641 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
642 ; AVX512F-NEXT: vzeroupper
645 ; AVX512VL-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8:
647 ; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0
648 ; AVX512VL-NEXT: vzeroupper
649 ; AVX512VL-NEXT: retq
651 ; AVX512BW-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8:
653 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
654 ; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
655 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
656 ; AVX512BW-NEXT: vzeroupper
657 ; AVX512BW-NEXT: retq
659 ; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8:
660 ; AVX512BWVL: # %bb.0:
661 ; AVX512BWVL-NEXT: vpmovdb %ymm0, %xmm0
662 ; AVX512BWVL-NEXT: vzeroupper
663 ; AVX512BWVL-NEXT: retq
665 ; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8:
666 ; AVX512VBMIVL: # %bb.0:
667 ; AVX512VBMIVL-NEXT: vpmovdb %ymm0, %xmm0
668 ; AVX512VBMIVL-NEXT: vzeroupper
669 ; AVX512VBMIVL-NEXT: retq
670 %truncated = trunc <8 x i32> %vec to <8 x i16>
671 %bc = bitcast <8 x i16> %truncated to <16 x i8>
672 %result = shufflevector <16 x i8> %bc, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 17, i32 20, i32 24, i32 22, i32 31, i32 28, i32 28, i32 29>
673 ret <16 x i8> %result
676 define <16 x i8> @trunc_v8i32_to_v8i8_return_v16i8(<8 x i32> %vec) nounwind {
677 ; AVX1-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
679 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
680 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
681 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
682 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
683 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
684 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
685 ; AVX1-NEXT: vzeroupper
688 ; AVX2-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
690 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
691 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
692 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
693 ; AVX2-NEXT: vzeroupper
696 ; AVX512F-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
698 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
699 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
700 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
701 ; AVX512F-NEXT: vzeroupper
704 ; AVX512VL-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
706 ; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0
707 ; AVX512VL-NEXT: vzeroupper
708 ; AVX512VL-NEXT: retq
710 ; AVX512BW-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
712 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
713 ; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
714 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
715 ; AVX512BW-NEXT: vzeroupper
716 ; AVX512BW-NEXT: retq
718 ; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
719 ; AVX512BWVL: # %bb.0:
720 ; AVX512BWVL-NEXT: vpmovdb %ymm0, %xmm0
721 ; AVX512BWVL-NEXT: vzeroupper
722 ; AVX512BWVL-NEXT: retq
724 ; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
725 ; AVX512VBMIVL: # %bb.0:
726 ; AVX512VBMIVL-NEXT: vpmovdb %ymm0, %xmm0
727 ; AVX512VBMIVL-NEXT: vzeroupper
728 ; AVX512VBMIVL-NEXT: retq
729 %truncated = trunc <8 x i32> %vec to <8 x i8>
730 %result = shufflevector <8 x i8> %truncated, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
731 ret <16 x i8> %result
734 define <2 x i64> @trunc_v4i64_to_v4i16_return_v2i64(<4 x i64> %vec) nounwind {
736 ; return (__m128i) {(long long)__builtin_convertvector((__v4di)x, __v4hi), 0};
737 ; AVX1-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
739 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
740 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
741 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
742 ; AVX1-NEXT: vzeroupper
745 ; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
746 ; AVX2-SLOW: # %bb.0:
747 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
748 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
749 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
750 ; AVX2-SLOW-NEXT: vzeroupper
751 ; AVX2-SLOW-NEXT: retq
753 ; AVX2-FAST-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
754 ; AVX2-FAST: # %bb.0:
755 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
756 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
757 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
758 ; AVX2-FAST-NEXT: vzeroupper
759 ; AVX2-FAST-NEXT: retq
761 ; AVX512F-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
763 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
764 ; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
765 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
766 ; AVX512F-NEXT: vzeroupper
769 ; AVX512VL-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
771 ; AVX512VL-NEXT: vpmovqw %ymm0, %xmm0
772 ; AVX512VL-NEXT: vzeroupper
773 ; AVX512VL-NEXT: retq
775 ; AVX512BW-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
777 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
778 ; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
779 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
780 ; AVX512BW-NEXT: vzeroupper
781 ; AVX512BW-NEXT: retq
783 ; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
784 ; AVX512BWVL: # %bb.0:
785 ; AVX512BWVL-NEXT: vpmovqw %ymm0, %xmm0
786 ; AVX512BWVL-NEXT: vzeroupper
787 ; AVX512BWVL-NEXT: retq
789 ; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
790 ; AVX512VBMIVL: # %bb.0:
791 ; AVX512VBMIVL-NEXT: vpmovqw %ymm0, %xmm0
792 ; AVX512VBMIVL-NEXT: vzeroupper
793 ; AVX512VBMIVL-NEXT: retq
794 %truncated = trunc <4 x i64> %vec to <4 x i16>
795 %bc = bitcast <4 x i16> %truncated to i64
796 %result = insertelement <2 x i64> zeroinitializer, i64 %bc, i32 0
797 ret <2 x i64> %result
800 define <8 x i16> @trunc_v4i64_to_v4i16_with_zext_return_v8i16(<4 x i64> %vec) nounwind {
801 ; AVX1-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
803 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
804 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
805 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
806 ; AVX1-NEXT: vzeroupper
809 ; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
810 ; AVX2-SLOW: # %bb.0:
811 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
812 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
813 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
814 ; AVX2-SLOW-NEXT: vzeroupper
815 ; AVX2-SLOW-NEXT: retq
817 ; AVX2-FAST-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
818 ; AVX2-FAST: # %bb.0:
819 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
820 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
821 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
822 ; AVX2-FAST-NEXT: vzeroupper
823 ; AVX2-FAST-NEXT: retq
825 ; AVX512F-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
827 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
828 ; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
829 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
830 ; AVX512F-NEXT: vzeroupper
833 ; AVX512VL-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
835 ; AVX512VL-NEXT: vpmovqw %ymm0, %xmm0
836 ; AVX512VL-NEXT: vzeroupper
837 ; AVX512VL-NEXT: retq
839 ; AVX512BW-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
841 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
842 ; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
843 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
844 ; AVX512BW-NEXT: vzeroupper
845 ; AVX512BW-NEXT: retq
847 ; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
848 ; AVX512BWVL: # %bb.0:
849 ; AVX512BWVL-NEXT: vpmovqw %ymm0, %xmm0
850 ; AVX512BWVL-NEXT: vzeroupper
851 ; AVX512BWVL-NEXT: retq
853 ; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
854 ; AVX512VBMIVL: # %bb.0:
855 ; AVX512VBMIVL-NEXT: vpmovqw %ymm0, %xmm0
856 ; AVX512VBMIVL-NEXT: vzeroupper
857 ; AVX512VBMIVL-NEXT: retq
858 %truncated = trunc <4 x i64> %vec to <4 x i16>
859 %truncated.ext = zext <4 x i16> %truncated to <4 x i32>
860 %bc = bitcast <4 x i32> %truncated.ext to <8 x i16>
861 %result = shufflevector <8 x i16> %bc, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
862 ret <8 x i16> %result
865 define <8 x i16> @trunc_v4i64_to_v4i16_via_v4i32_return_v8i16(<4 x i64> %vec) nounwind {
866 ; AVX1-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
868 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
869 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
870 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
871 ; AVX1-NEXT: vzeroupper
874 ; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
875 ; AVX2-SLOW: # %bb.0:
876 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
877 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
878 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
879 ; AVX2-SLOW-NEXT: vzeroupper
880 ; AVX2-SLOW-NEXT: retq
882 ; AVX2-FAST-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
883 ; AVX2-FAST: # %bb.0:
884 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
885 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
886 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
887 ; AVX2-FAST-NEXT: vzeroupper
888 ; AVX2-FAST-NEXT: retq
890 ; AVX512F-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
892 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
893 ; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
894 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
895 ; AVX512F-NEXT: vzeroupper
898 ; AVX512VL-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
900 ; AVX512VL-NEXT: vpmovqw %ymm0, %xmm0
901 ; AVX512VL-NEXT: vzeroupper
902 ; AVX512VL-NEXT: retq
904 ; AVX512BW-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
906 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
907 ; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
908 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
909 ; AVX512BW-NEXT: vzeroupper
910 ; AVX512BW-NEXT: retq
912 ; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
913 ; AVX512BWVL: # %bb.0:
914 ; AVX512BWVL-NEXT: vpmovqw %ymm0, %xmm0
915 ; AVX512BWVL-NEXT: vzeroupper
916 ; AVX512BWVL-NEXT: retq
918 ; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
919 ; AVX512VBMIVL: # %bb.0:
920 ; AVX512VBMIVL-NEXT: vpmovqw %ymm0, %xmm0
921 ; AVX512VBMIVL-NEXT: vzeroupper
922 ; AVX512VBMIVL-NEXT: retq
923 %truncated = trunc <4 x i64> %vec to <4 x i32>
924 %bc = bitcast <4 x i32> %truncated to <8 x i16>
925 %result = shufflevector <8 x i16> %bc, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 8, i32 undef, i32 13>
926 ret <8 x i16> %result
929 define <8 x i16> @trunc_v4i64_to_v4i16_return_v8i16(<4 x i64> %vec) nounwind {
930 ; AVX1-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
932 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
933 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
934 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
935 ; AVX1-NEXT: vzeroupper
938 ; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
939 ; AVX2-SLOW: # %bb.0:
940 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
941 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
942 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
943 ; AVX2-SLOW-NEXT: vzeroupper
944 ; AVX2-SLOW-NEXT: retq
946 ; AVX2-FAST-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
947 ; AVX2-FAST: # %bb.0:
948 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
949 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
950 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
951 ; AVX2-FAST-NEXT: vzeroupper
952 ; AVX2-FAST-NEXT: retq
954 ; AVX512F-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
956 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
957 ; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
958 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
959 ; AVX512F-NEXT: vzeroupper
962 ; AVX512VL-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
964 ; AVX512VL-NEXT: vpmovqw %ymm0, %xmm0
965 ; AVX512VL-NEXT: vzeroupper
966 ; AVX512VL-NEXT: retq
968 ; AVX512BW-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
970 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
971 ; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
972 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
973 ; AVX512BW-NEXT: vzeroupper
974 ; AVX512BW-NEXT: retq
976 ; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
977 ; AVX512BWVL: # %bb.0:
978 ; AVX512BWVL-NEXT: vpmovqw %ymm0, %xmm0
979 ; AVX512BWVL-NEXT: vzeroupper
980 ; AVX512BWVL-NEXT: retq
982 ; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
983 ; AVX512VBMIVL: # %bb.0:
984 ; AVX512VBMIVL-NEXT: vpmovqw %ymm0, %xmm0
985 ; AVX512VBMIVL-NEXT: vzeroupper
986 ; AVX512VBMIVL-NEXT: retq
987 %truncated = trunc <4 x i64> %vec to <4 x i16>
988 %result = shufflevector <4 x i16> %truncated, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
989 ret <8 x i16> %result
992 define <16 x i8> @trunc_v4i64_to_v4i8_return_v16i8(<4 x i64> %vec) nounwind {
993 ; AVX1-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
995 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
996 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
997 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[u],zero
998 ; AVX1-NEXT: vzeroupper
1001 ; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
1002 ; AVX2-SLOW: # %bb.0:
1003 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
1004 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
1005 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[u],zero
1006 ; AVX2-SLOW-NEXT: vzeroupper
1007 ; AVX2-SLOW-NEXT: retq
1009 ; AVX2-FAST-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
1010 ; AVX2-FAST: # %bb.0:
1011 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
1012 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
1013 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[u],zero
1014 ; AVX2-FAST-NEXT: vzeroupper
1015 ; AVX2-FAST-NEXT: retq
1017 ; AVX512F-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
1019 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1020 ; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
1021 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[u],zero
1022 ; AVX512F-NEXT: vzeroupper
1023 ; AVX512F-NEXT: retq
1025 ; AVX512VL-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
1026 ; AVX512VL: # %bb.0:
1027 ; AVX512VL-NEXT: vpmovqb %ymm0, %xmm0
1028 ; AVX512VL-NEXT: vzeroupper
1029 ; AVX512VL-NEXT: retq
1031 ; AVX512BW-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
1032 ; AVX512BW: # %bb.0:
1033 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1034 ; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
1035 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[u],zero
1036 ; AVX512BW-NEXT: vzeroupper
1037 ; AVX512BW-NEXT: retq
1039 ; AVX512BWVL-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
1040 ; AVX512BWVL: # %bb.0:
1041 ; AVX512BWVL-NEXT: vpmovqb %ymm0, %xmm0
1042 ; AVX512BWVL-NEXT: vzeroupper
1043 ; AVX512BWVL-NEXT: retq
1045 ; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
1046 ; AVX512VBMIVL: # %bb.0:
1047 ; AVX512VBMIVL-NEXT: vpmovqb %ymm0, %xmm0
1048 ; AVX512VBMIVL-NEXT: vzeroupper
1049 ; AVX512VBMIVL-NEXT: retq
1050 %truncated = trunc <4 x i64> %vec to <4 x i8>
1051 %result = shufflevector <4 x i8> %truncated, <4 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 5, i32 5, i32 undef, i32 7>
1052 ret <16 x i8> %result
1055 define void @shuffle_v16i16_to_v4i16(<16 x i16>* %L, <4 x i16>* %S) nounwind {
1056 ; AVX1-LABEL: shuffle_v16i16_to_v4i16:
1058 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
1059 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
1060 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = mem[0,2,2,3]
1061 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
1062 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1063 ; AVX1-NEXT: vmovq %xmm0, (%rsi)
1066 ; AVX2-SLOW-LABEL: shuffle_v16i16_to_v4i16:
1067 ; AVX2-SLOW: # %bb.0:
1068 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
1069 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
1070 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = mem[0,2,2,3]
1071 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
1072 ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1073 ; AVX2-SLOW-NEXT: vmovq %xmm0, (%rsi)
1074 ; AVX2-SLOW-NEXT: retq
1076 ; AVX2-FAST-LABEL: shuffle_v16i16_to_v4i16:
1077 ; AVX2-FAST: # %bb.0:
1078 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0
1079 ; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm1
1080 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
1081 ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1
1082 ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0
1083 ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1084 ; AVX2-FAST-NEXT: vmovq %xmm0, (%rsi)
1085 ; AVX2-FAST-NEXT: retq
1087 ; AVX512F-LABEL: shuffle_v16i16_to_v4i16:
1089 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
1090 ; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
1091 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = mem[0,2,2,3]
1092 ; AVX512F-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
1093 ; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1094 ; AVX512F-NEXT: vmovq %xmm0, (%rsi)
1095 ; AVX512F-NEXT: retq
1097 ; AVX512VL-LABEL: shuffle_v16i16_to_v4i16:
1098 ; AVX512VL: # %bb.0:
1099 ; AVX512VL-NEXT: vmovaps (%rdi), %xmm0
1100 ; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
1101 ; AVX512VL-NEXT: vpmovdw %xmm0, (%rsi)
1102 ; AVX512VL-NEXT: retq
1104 ; AVX512BW-LABEL: shuffle_v16i16_to_v4i16:
1105 ; AVX512BW: # %bb.0:
1106 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
1107 ; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
1108 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
1109 ; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
1110 ; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
1111 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1112 ; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
1113 ; AVX512BW-NEXT: retq
1115 ; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16:
1116 ; AVX512BWVL: # %bb.0:
1117 ; AVX512BWVL-NEXT: vmovaps (%rdi), %xmm0
1118 ; AVX512BWVL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
1119 ; AVX512BWVL-NEXT: vpmovdw %xmm0, (%rsi)
1120 ; AVX512BWVL-NEXT: retq
1122 ; AVX512VBMIVL-LABEL: shuffle_v16i16_to_v4i16:
1123 ; AVX512VBMIVL: # %bb.0:
1124 ; AVX512VBMIVL-NEXT: vmovaps (%rdi), %xmm0
1125 ; AVX512VBMIVL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
1126 ; AVX512VBMIVL-NEXT: vpmovdw %xmm0, (%rsi)
1127 ; AVX512VBMIVL-NEXT: retq
1128 %vec = load <16 x i16>, <16 x i16>* %L
1129 %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
1130 store <4 x i16> %strided.vec, <4 x i16>* %S
1134 define void @trunc_v4i64_to_v4i16(<16 x i16>* %L, <4 x i16>* %S) nounwind {
1135 ; AVX1-LABEL: trunc_v4i64_to_v4i16:
1137 ; AVX1-NEXT: vmovaps (%rdi), %xmm0
1138 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
1139 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
1140 ; AVX1-NEXT: vmovq %xmm0, (%rsi)
1143 ; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16:
1144 ; AVX2-SLOW: # %bb.0:
1145 ; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm0
1146 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
1147 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
1148 ; AVX2-SLOW-NEXT: vmovq %xmm0, (%rsi)
1149 ; AVX2-SLOW-NEXT: retq
1151 ; AVX2-FAST-LABEL: trunc_v4i64_to_v4i16:
1152 ; AVX2-FAST: # %bb.0:
1153 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,4,6,4,6,6,7]
1154 ; AVX2-FAST-NEXT: vpermd (%rdi), %ymm0, %ymm0
1155 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
1156 ; AVX2-FAST-NEXT: vmovq %xmm0, (%rsi)
1157 ; AVX2-FAST-NEXT: vzeroupper
1158 ; AVX2-FAST-NEXT: retq
1160 ; AVX512F-LABEL: trunc_v4i64_to_v4i16:
1162 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
1163 ; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
1164 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
1165 ; AVX512F-NEXT: vmovq %xmm0, (%rsi)
1166 ; AVX512F-NEXT: vzeroupper
1167 ; AVX512F-NEXT: retq
1169 ; AVX512VL-LABEL: trunc_v4i64_to_v4i16:
1170 ; AVX512VL: # %bb.0:
1171 ; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
1172 ; AVX512VL-NEXT: vpmovqw %ymm0, (%rsi)
1173 ; AVX512VL-NEXT: vzeroupper
1174 ; AVX512VL-NEXT: retq
1176 ; AVX512BW-LABEL: trunc_v4i64_to_v4i16:
1177 ; AVX512BW: # %bb.0:
1178 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
1179 ; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
1180 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
1181 ; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
1182 ; AVX512BW-NEXT: vzeroupper
1183 ; AVX512BW-NEXT: retq
1185 ; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16:
1186 ; AVX512BWVL: # %bb.0:
1187 ; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
1188 ; AVX512BWVL-NEXT: vpmovqw %ymm0, (%rsi)
1189 ; AVX512BWVL-NEXT: vzeroupper
1190 ; AVX512BWVL-NEXT: retq
1192 ; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i16:
1193 ; AVX512VBMIVL: # %bb.0:
1194 ; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
1195 ; AVX512VBMIVL-NEXT: vpmovqw %ymm0, (%rsi)
1196 ; AVX512VBMIVL-NEXT: vzeroupper
1197 ; AVX512VBMIVL-NEXT: retq
1198 %vec = load <16 x i16>, <16 x i16>* %L
1199 %bc = bitcast <16 x i16> %vec to <4 x i64>
1200 %strided.vec = trunc <4 x i64> %bc to <4 x i16>
1201 store <4 x i16> %strided.vec, <4 x i16>* %S
1205 define void @shuffle_v32i8_to_v4i8(<32 x i8>* %L, <4 x i8>* %S) nounwind {
1206 ; AVX-LABEL: shuffle_v32i8_to_v4i8:
1208 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
1209 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
1210 ; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
1211 ; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
1212 ; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
1213 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1214 ; AVX-NEXT: vmovd %xmm0, (%rsi)
1217 ; AVX512F-LABEL: shuffle_v32i8_to_v4i8:
1219 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
1220 ; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
1221 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
1222 ; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
1223 ; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
1224 ; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1225 ; AVX512F-NEXT: vmovd %xmm0, (%rsi)
1226 ; AVX512F-NEXT: retq
1228 ; AVX512VL-LABEL: shuffle_v32i8_to_v4i8:
1229 ; AVX512VL: # %bb.0:
1230 ; AVX512VL-NEXT: vmovaps (%rdi), %xmm0
1231 ; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
1232 ; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
1233 ; AVX512VL-NEXT: retq
1235 ; AVX512BW-LABEL: shuffle_v32i8_to_v4i8:
1236 ; AVX512BW: # %bb.0:
1237 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
1238 ; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
1239 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
1240 ; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
1241 ; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
1242 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1243 ; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
1244 ; AVX512BW-NEXT: retq
1246 ; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8:
1247 ; AVX512BWVL: # %bb.0:
1248 ; AVX512BWVL-NEXT: vmovaps (%rdi), %xmm0
1249 ; AVX512BWVL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
1250 ; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
1251 ; AVX512BWVL-NEXT: retq
1253 ; AVX512VBMIVL-LABEL: shuffle_v32i8_to_v4i8:
1254 ; AVX512VBMIVL: # %bb.0:
1255 ; AVX512VBMIVL-NEXT: vmovaps (%rdi), %xmm0
1256 ; AVX512VBMIVL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
1257 ; AVX512VBMIVL-NEXT: vpmovdb %xmm0, (%rsi)
1258 ; AVX512VBMIVL-NEXT: retq
1259 %vec = load <32 x i8>, <32 x i8>* %L
1260 %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 0, i32 8, i32 16, i32 24>
1261 store <4 x i8> %strided.vec, <4 x i8>* %S
1265 define void @trunc_v4i64_to_v4i8(<32 x i8>* %L, <4 x i8>* %S) nounwind {
1266 ; AVX1-LABEL: trunc_v4i64_to_v4i8:
1268 ; AVX1-NEXT: vmovaps (%rdi), %xmm0
1269 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
1270 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
1271 ; AVX1-NEXT: vmovd %xmm0, (%rsi)
1274 ; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i8:
1275 ; AVX2-SLOW: # %bb.0:
1276 ; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm0
1277 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
1278 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
1279 ; AVX2-SLOW-NEXT: vmovd %xmm0, (%rsi)
1280 ; AVX2-SLOW-NEXT: retq
1282 ; AVX2-FAST-LABEL: trunc_v4i64_to_v4i8:
1283 ; AVX2-FAST: # %bb.0:
1284 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,4,6,4,6,6,7]
1285 ; AVX2-FAST-NEXT: vpermd (%rdi), %ymm0, %ymm0
1286 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
1287 ; AVX2-FAST-NEXT: vmovd %xmm0, (%rsi)
1288 ; AVX2-FAST-NEXT: vzeroupper
1289 ; AVX2-FAST-NEXT: retq
1291 ; AVX512F-LABEL: trunc_v4i64_to_v4i8:
1293 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
1294 ; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
1295 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
1296 ; AVX512F-NEXT: vmovd %xmm0, (%rsi)
1297 ; AVX512F-NEXT: vzeroupper
1298 ; AVX512F-NEXT: retq
1300 ; AVX512VL-LABEL: trunc_v4i64_to_v4i8:
1301 ; AVX512VL: # %bb.0:
1302 ; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
1303 ; AVX512VL-NEXT: vpmovqb %ymm0, (%rsi)
1304 ; AVX512VL-NEXT: vzeroupper
1305 ; AVX512VL-NEXT: retq
1307 ; AVX512BW-LABEL: trunc_v4i64_to_v4i8:
1308 ; AVX512BW: # %bb.0:
1309 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
1310 ; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
1311 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
1312 ; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
1313 ; AVX512BW-NEXT: vzeroupper
1314 ; AVX512BW-NEXT: retq
1316 ; AVX512BWVL-LABEL: trunc_v4i64_to_v4i8:
1317 ; AVX512BWVL: # %bb.0:
1318 ; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
1319 ; AVX512BWVL-NEXT: vpmovqb %ymm0, (%rsi)
1320 ; AVX512BWVL-NEXT: vzeroupper
1321 ; AVX512BWVL-NEXT: retq
1323 ; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i8:
1324 ; AVX512VBMIVL: # %bb.0:
1325 ; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
1326 ; AVX512VBMIVL-NEXT: vpmovqb %ymm0, (%rsi)
1327 ; AVX512VBMIVL-NEXT: vzeroupper
1328 ; AVX512VBMIVL-NEXT: retq
1329 %vec = load <32 x i8>, <32 x i8>* %L
1330 %bc = bitcast <32 x i8> %vec to <4 x i64>
1331 %strided.vec = trunc <4 x i64> %bc to <4 x i8>
1332 store <4 x i8> %strided.vec, <4 x i8>* %S
; In this case not all elements are collected from the same source vector, so
; the resulting BUILD_VECTOR should not be combined to a truncate.
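; For example, in @negative below the even bytes of %v are gathered, but element
; 0 of the result is then replaced with byte 0 of %w, so the merged value is not
; a plain truncate of either input on its own.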
1338 define <16 x i8> @negative(<32 x i8> %v, <32 x i8> %w) nounwind {
1339 ; AVX1-LABEL: negative:
1341 ; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[u,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
1342 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1343 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u],zero,zero,zero,zero,zero,zero,zero,xmm0[0,2,4,6,8,10,12,14]
1344 ; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
1345 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
1346 ; AVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
1347 ; AVX1-NEXT: vzeroupper
1350 ; AVX2-LABEL: negative:
1352 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,u,18,20,22,24,26,28,30,16,18,20,22,24,26,28,30]
1353 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
1354 ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
1355 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
1356 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1357 ; AVX2-NEXT: vzeroupper
1360 ; AVX512F-LABEL: negative:
1362 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,u,18,20,22,24,26,28,30,16,18,20,22,24,26,28,30]
1363 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
1364 ; AVX512F-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
1365 ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
1366 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1367 ; AVX512F-NEXT: vzeroupper
1368 ; AVX512F-NEXT: retq
1370 ; AVX512VL-LABEL: negative:
1371 ; AVX512VL: # %bb.0:
1372 ; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,u,18,20,22,24,26,28,30,16,18,20,22,24,26,28,30]
1373 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
1374 ; AVX512VL-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
1375 ; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
1376 ; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1377 ; AVX512VL-NEXT: vzeroupper
1378 ; AVX512VL-NEXT: retq
1380 ; AVX512BW-LABEL: negative:
1381 ; AVX512BW: # %bb.0:
1382 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,u,18,20,22,24,26,28,30,16,18,20,22,24,26,28,30]
1383 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
1384 ; AVX512BW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
1385 ; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
1386 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1387 ; AVX512BW-NEXT: vzeroupper
1388 ; AVX512BW-NEXT: retq
1390 ; AVX512BWVL-LABEL: negative:
1391 ; AVX512BWVL: # %bb.0:
1392 ; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,u,18,20,22,24,26,28,30,16,18,20,22,24,26,28,30]
1393 ; AVX512BWVL-NEXT: movl $65537, %eax # imm = 0x10001
1394 ; AVX512BWVL-NEXT: kmovd %eax, %k1
1395 ; AVX512BWVL-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1}
1396 ; AVX512BWVL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
1397 ; AVX512BWVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1398 ; AVX512BWVL-NEXT: vzeroupper
1399 ; AVX512BWVL-NEXT: retq
1401 ; AVX512VBMIVL-LABEL: negative:
1402 ; AVX512VBMIVL: # %bb.0:
1403 ; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} ymm2 = [32,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,48,18,20,22,24,26,28,30,16,18,20,22,24,26,28,30]
1404 ; AVX512VBMIVL-NEXT: vpermt2b %ymm1, %ymm2, %ymm0
1405 ; AVX512VBMIVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1406 ; AVX512VBMIVL-NEXT: vzeroupper
1407 ; AVX512VBMIVL-NEXT: retq
1408 %strided.vec = shufflevector <32 x i8> %v, <32 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
1409 %w0 = extractelement <32 x i8> %w, i32 0
1410 %merged = insertelement <16 x i8> %strided.vec, i8 %w0, i32 0
1411 ret <16 x i8> %merged