; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST-ALL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST-PERLANE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BWVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BWVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512VBMIVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512VBMIVL

; Pairs of shufflevector:trunc functions with functional equivalence.
; Ideally, the shuffles should be lowered to code with the same quality as the truncates.
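;
; As a rough illustration (hypothetical C using clang vector builtins; the
; __v32qi/__v16hi/__v16qi names and the helper functions below are assumptions,
; not part of this test), each pair has this shape: the strided shuffle and the
; truncate select the same bytes, so both should lower to equally good code:
;
;   __v16qi via_shuffle(__v32qi v) {
;     return __builtin_shufflevector(v, v, 0, 2, 4, 6, 8, 10, 12, 14,
;                                    16, 18, 20, 22, 24, 26, 28, 30);
;   }
;   __v16qi via_trunc(__v32qi v) {
;     return __builtin_convertvector((__v16hi)v, __v16qi);
;   }
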
define void @shuffle_v32i8_to_v16i8(<32 x i8>* %L, <16 x i8>* %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v16i8:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255]
; AVX-NEXT: vpand 16(%rdi), %xmm0, %xmm1
; AVX-NEXT: vpand (%rdi), %xmm0, %xmm0
; AVX-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, (%rsi)
; AVX-NEXT: retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v16i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255]
; AVX512F-NEXT: vpand 16(%rdi), %xmm0, %xmm1
; AVX512F-NEXT: vpand (%rdi), %xmm0, %xmm0
; AVX512F-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v16i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255]
; AVX512VL-NEXT: vpand 16(%rdi), %xmm0, %xmm1
; AVX512VL-NEXT: vpand (%rdi), %xmm0, %xmm0
; AVX512VL-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v16i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: shuffle_v32i8_to_v16i8:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VBMIVL-NEXT: vpmovwb %ymm0, (%rsi)
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
  store <16 x i8> %strided.vec, <16 x i8>* %S
  ret void
}

define void @trunc_v16i16_to_v16i8(<32 x i8>* %L, <16 x i8>* %S) nounwind {
; AVX1-LABEL: trunc_v16i16_to_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovaps (%rdi), %ymm0
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovdqa %xmm0, (%rsi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_v16i16_to_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovdqa %xmm0, (%rsi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_v16i16_to_v16i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512F-NEXT: vpmovdb %zmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_v16i16_to_v16i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512VL-NEXT: vpmovdb %zmm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_v16i16_to_v16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_v16i16_to_v16i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: trunc_v16i16_to_v16i8:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VBMIVL-NEXT: vpmovwb %ymm0, (%rsi)
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %bc = bitcast <32 x i8> %vec to <16 x i16>
  %strided.vec = trunc <16 x i16> %bc to <16 x i8>
  store <16 x i8> %strided.vec, <16 x i8>* %S
  ret void
}

define void @shuffle_v16i16_to_v8i16(<16 x i16>* %L, <8 x i16>* %S) nounwind {
; AVX-LABEL: shuffle_v16i16_to_v8i16:
; AVX: # %bb.0:
; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = mem[0],xmm0[1],mem[2],xmm0[3],mem[4],xmm0[5],mem[6],xmm0[7]
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1],mem[2],xmm0[3],mem[4],xmm0[5],mem[6],xmm0[7]
; AVX-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, (%rsi)
; AVX-NEXT: retq
;
; AVX512F-LABEL: shuffle_v16i16_to_v8i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_to_v8i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vpmovdw %ymm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v8i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v8i16:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vpmovdw %ymm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: shuffle_v16i16_to_v8i16:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VBMIVL-NEXT: vpmovdw %ymm0, (%rsi)
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %vec = load <16 x i16>, <16 x i16>* %L
  %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  store <8 x i16> %strided.vec, <8 x i16>* %S
  ret void
}

define void @trunc_v8i32_to_v8i16(<16 x i16>* %L, <8 x i16>* %S) nounwind {
; AVX1-LABEL: trunc_v8i32_to_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %xmm0
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT: vmovdqa %xmm0, (%rsi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_v8i32_to_v8i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vmovdqa %xmm0, (%rsi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_v8i32_to_v8i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_v8i32_to_v8i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vpmovdw %ymm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_v8i32_to_v8i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_v8i32_to_v8i16:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vpmovdw %ymm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i16:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VBMIVL-NEXT: vpmovdw %ymm0, (%rsi)
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %vec = load <16 x i16>, <16 x i16>* %L
  %bc = bitcast <16 x i16> %vec to <8 x i32>
  %strided.vec = trunc <8 x i32> %bc to <8 x i16>
  store <8 x i16> %strided.vec, <8 x i16>* %S
  ret void
}

define void @shuffle_v8i32_to_v4i32(<8 x i32>* %L, <4 x i32>* %S) nounwind {
; AVX-LABEL: shuffle_v8i32_to_v4i32:
; AVX: # %bb.0:
; AVX-NEXT: vmovaps (%rdi), %xmm0
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
; AVX-NEXT: vmovaps %xmm0, (%rsi)
; AVX-NEXT: retq
;
; AVX512F-LABEL: shuffle_v8i32_to_v4i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovaps (%rdi), %xmm0
; AVX512F-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
; AVX512F-NEXT: vmovaps %xmm0, (%rsi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v8i32_to_v4i32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vpmovqd %ymm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v8i32_to_v4i32:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovaps (%rdi), %xmm0
; AVX512BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
; AVX512BW-NEXT: vmovaps %xmm0, (%rsi)
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v8i32_to_v4i32:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vpmovqd %ymm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: shuffle_v8i32_to_v4i32:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VBMIVL-NEXT: vpmovqd %ymm0, (%rsi)
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %vec = load <8 x i32>, <8 x i32>* %L
  %strided.vec = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  store <4 x i32> %strided.vec, <4 x i32>* %S
  ret void
}

define void @trunc_v4i64_to_v4i32(<8 x i32>* %L, <4 x i32>* %S) nounwind {
; AVX1-LABEL: trunc_v4i64_to_v4i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovaps (%rdi), %xmm0
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
; AVX1-NEXT: vmovaps %xmm0, (%rsi)
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm0
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
; AVX2-SLOW-NEXT: vmovaps %xmm0, (%rsi)
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-ALL-LABEL: trunc_v4i64_to_v4i32:
; AVX2-FAST-ALL: # %bb.0:
; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm0 = <0,2,4,6,u,u,u,u>
; AVX2-FAST-ALL-NEXT: vpermps (%rdi), %ymm0, %ymm0
; AVX2-FAST-ALL-NEXT: vmovaps %xmm0, (%rsi)
; AVX2-FAST-ALL-NEXT: vzeroupper
; AVX2-FAST-ALL-NEXT: retq
;
; AVX2-FAST-PERLANE-LABEL: trunc_v4i64_to_v4i32:
; AVX2-FAST-PERLANE: # %bb.0:
; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm0
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, (%rsi)
; AVX2-FAST-PERLANE-NEXT: retq
;
; AVX512F-LABEL: trunc_v4i64_to_v4i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_v4i64_to_v4i32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vpmovqd %ymm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_v4i64_to_v4i32:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_v4i64_to_v4i32:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vpmovqd %ymm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i32:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VBMIVL-NEXT: vpmovqd %ymm0, (%rsi)
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %vec = load <8 x i32>, <8 x i32>* %L
  %bc = bitcast <8 x i32> %vec to <4 x i64>
  %strided.vec = trunc <4 x i64> %bc to <4 x i32>
  store <4 x i32> %strided.vec, <4 x i32>* %S
  ret void
}

define void @shuffle_v32i8_to_v8i8(<32 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v8i8:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-NEXT: vmovq %xmm0, (%rsi)
; AVX-NEXT: retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v8i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vmovq %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v8i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vpmovdb %ymm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v8i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vpmovdb %ymm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: shuffle_v32i8_to_v8i8:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VBMIVL-NEXT: vpmovdb %ymm0, (%rsi)
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}

define void @trunc_v8i32_to_v8i8(<32 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX-LABEL: trunc_v8i32_to_v8i8:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-NEXT: vmovq %xmm0, (%rsi)
; AVX-NEXT: retq
;
; AVX512F-LABEL: trunc_v8i32_to_v8i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vmovq %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_v8i32_to_v8i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vpmovdb %ymm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_v8i32_to_v8i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vpmovdb %ymm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i8:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VBMIVL-NEXT: vpmovdb %ymm0, (%rsi)
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %bc = bitcast <32 x i8> %vec to <8 x i32>
  %strided.vec = trunc <8 x i32> %bc to <8 x i8>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}

define <2 x i64> @trunc_v8i32_to_v8i8_return_v2i64(<8 x i32> %vec) nounwind {
; return (__m128i) {(long long)__builtin_convertvector((__v8si)__A, __v8qi), 0};
; AVX1-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpmovdb %ymm0, %xmm0
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vpmovdb %ymm0, %xmm0
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %truncated.vec = trunc <8 x i32> %vec to <8 x i8>
  %bc = bitcast <8 x i8> %truncated.vec to i64
  %result = insertelement <2 x i64> zeroinitializer, i64 %bc, i32 0
  ret <2 x i64> %result
}

define <16 x i8> @trunc_v8i32_to_v8i8_with_zext_return_v16i8(<8 x i32> %vec) nounwind {
; AVX1-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa %ymm0, %ymm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa %ymm0, %ymm0
; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpmovdb %ymm0, %xmm0
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vpmovdb %ymm0, %xmm0
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %truncated = trunc <8 x i32> %vec to <8 x i8>
  %truncated.ext = zext <8 x i8> %truncated to <8 x i16>
  %bc = bitcast <8 x i16> %truncated.ext to <16 x i8>
  %result = shufflevector <16 x i8> %bc, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  ret <16 x i8> %result
}

define <16 x i8> @trunc_v8i32_to_v8i8_via_v8i16_return_v16i8(<8 x i32> %vec) nounwind {
; AVX1-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa %ymm0, %ymm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa %ymm0, %ymm0
; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpmovdb %ymm0, %xmm0
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vpmovdb %ymm0, %xmm0
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %truncated = trunc <8 x i32> %vec to <8 x i16>
  %bc = bitcast <8 x i16> %truncated to <16 x i8>
  %result = shufflevector <16 x i8> %bc, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 17, i32 20, i32 24, i32 22, i32 31, i32 28, i32 28, i32 29>
  ret <16 x i8> %result
}

define <16 x i8> @trunc_v8i32_to_v8i8_return_v16i8(<8 x i32> %vec) nounwind {
; AVX1-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpmovdb %ymm0, %xmm0
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vpmovdb %ymm0, %xmm0
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %truncated = trunc <8 x i32> %vec to <8 x i8>
  %result = shufflevector <8 x i8> %truncated, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i8> %result
}

define <2 x i64> @trunc_v4i64_to_v4i16_return_v2i64(<4 x i64> %vec) nounwind {
; return (__m128i) {(long long)__builtin_convertvector((__v4di)x, __v4hi), 0};
; AVX1-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovqw %ymm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0
; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpmovqw %ymm0, %xmm0
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vpmovqw %ymm0, %xmm0
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %truncated = trunc <4 x i64> %vec to <4 x i16>
  %bc = bitcast <4 x i16> %truncated to i64
  %result = insertelement <2 x i64> zeroinitializer, i64 %bc, i32 0
  ret <2 x i64> %result
}

define <8 x i16> @trunc_v4i64_to_v4i16_with_zext_return_v8i16(<4 x i64> %vec) nounwind {
; AVX1-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-ALL-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
; AVX2-FAST-ALL: # %bb.0:
; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-FAST-ALL-NEXT: vzeroupper
; AVX2-FAST-ALL-NEXT: retq
;
; AVX2-FAST-PERLANE-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
; AVX2-FAST-PERLANE: # %bb.0:
; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-FAST-PERLANE-NEXT: vzeroupper
; AVX2-FAST-PERLANE-NEXT: retq
;
; AVX512F-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa %ymm0, %ymm0
; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovqw %ymm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa %ymm0, %ymm0
; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpmovqw %ymm0, %xmm0
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vpmovqw %ymm0, %xmm0
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %truncated = trunc <4 x i64> %vec to <4 x i16>
  %truncated.ext = zext <4 x i16> %truncated to <4 x i32>
  %bc = bitcast <4 x i32> %truncated.ext to <8 x i16>
  %result = shufflevector <8 x i16> %bc, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
  ret <8 x i16> %result
}

define <8 x i16> @trunc_v4i64_to_v4i16_via_v4i32_return_v8i16(<4 x i64> %vec) nounwind {
; AVX1-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-ALL-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
; AVX2-FAST-ALL: # %bb.0:
; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-FAST-ALL-NEXT: vzeroupper
; AVX2-FAST-ALL-NEXT: retq
;
; AVX2-FAST-PERLANE-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
; AVX2-FAST-PERLANE: # %bb.0:
; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-FAST-PERLANE-NEXT: vzeroupper
; AVX2-FAST-PERLANE-NEXT: retq
;
; AVX512F-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa %ymm0, %ymm0
; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovqw %ymm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa %ymm0, %ymm0
; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpmovqw %ymm0, %xmm0
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vpmovqw %ymm0, %xmm0
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %truncated = trunc <4 x i64> %vec to <4 x i32>
  %bc = bitcast <4 x i32> %truncated to <8 x i16>
  %result = shufflevector <8 x i16> %bc, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 8, i32 undef, i32 13>
  ret <8 x i16> %result
}

define <8 x i16> @trunc_v4i64_to_v4i16_return_v8i16(<4 x i64> %vec) nounwind {
; AVX1-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovqw %ymm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0
; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpmovqw %ymm0, %xmm0
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vpmovqw %ymm0, %xmm0
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %truncated = trunc <4 x i64> %vec to <4 x i16>
  %result = shufflevector <4 x i16> %truncated, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i16> %result
}

define <16 x i8> @trunc_v4i64_to_v4i8_return_v16i8(<4 x i64> %vec) nounwind {
; AVX1-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vpmovqb %zmm0, %xmm0
; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovqb %ymm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0
; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpmovqb %ymm0, %xmm0
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vpmovqb %ymm0, %xmm0
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %truncated = trunc <4 x i64> %vec to <4 x i8>
  %result = shufflevector <4 x i8> %truncated, <4 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 5, i32 5, i32 undef, i32 7>
  ret <16 x i8> %result
}

define void @shuffle_v16i16_to_v4i16(<16 x i16>* %L, <4 x i16>* %S) nounwind {
; AVX-LABEL: shuffle_v16i16_to_v4i16:
; AVX: # %bb.0:
; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
; AVX-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
; AVX-NEXT: vmovq %xmm0, (%rsi)
; AVX-NEXT: retq
;
; AVX512F-LABEL: shuffle_v16i16_to_v4i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
; AVX512F-NEXT: vmovq %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_to_v4i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vpmovqw %ymm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v4i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vpmovqw %ymm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: shuffle_v16i16_to_v4i16:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VBMIVL-NEXT: vpmovqw %ymm0, (%rsi)
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %vec = load <16 x i16>, <16 x i16>* %L
  %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
  store <4 x i16> %strided.vec, <4 x i16>* %S
  ret void
}

define void @trunc_v4i64_to_v4i16(<16 x i16>* %L, <4 x i16>* %S) nounwind {
; AVX-LABEL: trunc_v4i64_to_v4i16:
; AVX: # %bb.0:
; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
; AVX-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
; AVX-NEXT: vmovq %xmm0, (%rsi)
; AVX-NEXT: retq
;
; AVX512F-LABEL: trunc_v4i64_to_v4i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
; AVX512F-NEXT: vmovq %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_v4i64_to_v4i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vpmovqw %ymm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_v4i64_to_v4i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vpmovqw %ymm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i16:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VBMIVL-NEXT: vpmovqw %ymm0, (%rsi)
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %vec = load <16 x i16>, <16 x i16>* %L
  %bc = bitcast <16 x i16> %vec to <4 x i64>
  %strided.vec = trunc <4 x i64> %bc to <4 x i16>
  store <4 x i16> %strided.vec, <4 x i16>* %S
  ret void
}

define void @shuffle_v32i8_to_v4i8(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v4i8:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT: vmovd %xmm0, (%rsi)
; AVX-NEXT: retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v4i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpmovqb %zmm0, %xmm0
; AVX512F-NEXT: vmovd %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v4i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vpmovqb %ymm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v4i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0
; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vpmovqb %ymm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: shuffle_v32i8_to_v4i8:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VBMIVL-NEXT: vpmovqb %ymm0, (%rsi)
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 0, i32 8, i32 16, i32 24>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}

define void @trunc_v4i64_to_v4i8(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX-LABEL: trunc_v4i64_to_v4i8:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT: vmovd %xmm0, (%rsi)
; AVX-NEXT: retq
;
; AVX512F-LABEL: trunc_v4i64_to_v4i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpmovqb %zmm0, %xmm0
; AVX512F-NEXT: vmovd %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_v4i64_to_v4i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vpmovqb %ymm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_v4i64_to_v4i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0
; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_v4i64_to_v4i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vpmovqb %ymm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i8:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VBMIVL-NEXT: vpmovqb %ymm0, (%rsi)
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %bc = bitcast <32 x i8> %vec to <4 x i64>
  %strided.vec = trunc <4 x i64> %bc to <4 x i8>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}

; In this case not all elements are collected from the same source vector, so
; the resulting BUILD_VECTOR should not be combined to a truncate.
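; Roughly (hypothetical C, for illustration only):
;   r = __builtin_convertvector((__v16hi)v, __v16qi); r[0] = w[0];
; It is the r[0] = w[0] element, taken from the second source vector, that
; makes a plain truncate of %v insufficient.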
define <16 x i8> @negative(<32 x i8> %v, <32 x i8> %w) nounwind {
; AVX1-LABEL: negative:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[u,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u],zero,zero,zero,zero,zero,zero,zero,xmm0[0,2,4,6,8,10,12,14]
; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: negative:
; AVX2: # %bb.0:
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: negative:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512F-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: negative:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = zero,ymm0[2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX512VL-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: negative:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512BW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: negative:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
; AVX512BWVL-NEXT: movl $65537, %eax # imm = 0x10001
; AVX512BWVL-NEXT: kmovd %eax, %k1
; AVX512BWVL-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1}
; AVX512BWVL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX512BWVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: negative:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} ymm2 = <32,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VBMIVL-NEXT: vpermt2b %ymm1, %ymm2, %ymm0
; AVX512VBMIVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %strided.vec = shufflevector <32 x i8> %v, <32 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
  %w0 = extractelement <32 x i8> %w, i32 0
  %merged = insertelement <16 x i8> %strided.vec, i8 %w0, i32 0
  ret <16 x i8> %merged
}