1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
3 ; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
4 ; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
5 ; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
6 ; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VBMI
7 ; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VBMIVL
10 ; Pairs of shufflevector:trunc functions with functional equivalence.
11 ; Ideally, the shuffles should be lowered to code with the same quality as the truncates.
; Loads a <64 x i8> vector from %L, keeps the even-indexed bytes (0, 2, ..., 62)
; with a shufflevector, and stores the resulting <32 x i8> to %S.
; In the checks below only the AVX512VBMIVL configuration uses a single
; two-source byte permute (vpermi2b); every other configuration shuffles each
; 256-bit half with vpshufb and then combines the two halves.
13 define void @shuffle_v64i8_to_v32i8(<64 x i8>* %L, <32 x i8>* %S) nounwind {
14 ; AVX512F-LABEL: shuffle_v64i8_to_v32i8:
16 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
17 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
18 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
19 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
20 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
21 ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
22 ; AVX512F-NEXT: vmovdqa %ymm0, (%rsi)
23 ; AVX512F-NEXT: vzeroupper
26 ; AVX512VL-LABEL: shuffle_v64i8_to_v32i8:
28 ; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
29 ; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1
30 ; AVX512VL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
31 ; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
32 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,5,7]
33 ; AVX512VL-NEXT: vpermi2q %ymm1, %ymm0, %ymm2
34 ; AVX512VL-NEXT: vmovdqa %ymm2, (%rsi)
35 ; AVX512VL-NEXT: vzeroupper
38 ; AVX512BW-LABEL: shuffle_v64i8_to_v32i8:
40 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
41 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm1
42 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
43 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
44 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
45 ; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
46 ; AVX512BW-NEXT: vmovdqa %ymm0, (%rsi)
47 ; AVX512BW-NEXT: vzeroupper
50 ; AVX512BWVL-LABEL: shuffle_v64i8_to_v32i8:
51 ; AVX512BWVL: # %bb.0:
52 ; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
53 ; AVX512BWVL-NEXT: vmovdqa 32(%rdi), %ymm1
54 ; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
55 ; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
56 ; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,5,7]
57 ; AVX512BWVL-NEXT: vpermi2q %ymm1, %ymm0, %ymm2
58 ; AVX512BWVL-NEXT: vmovdqa %ymm2, (%rsi)
59 ; AVX512BWVL-NEXT: vzeroupper
60 ; AVX512BWVL-NEXT: retq
62 ; AVX512VBMI-LABEL: shuffle_v64i8_to_v32i8:
63 ; AVX512VBMI: # %bb.0:
64 ; AVX512VBMI-NEXT: vmovdqa (%rdi), %ymm0
65 ; AVX512VBMI-NEXT: vmovdqa 32(%rdi), %ymm1
66 ; AVX512VBMI-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
67 ; AVX512VBMI-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
68 ; AVX512VBMI-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
69 ; AVX512VBMI-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
70 ; AVX512VBMI-NEXT: vmovdqa %ymm0, (%rsi)
71 ; AVX512VBMI-NEXT: vzeroupper
72 ; AVX512VBMI-NEXT: retq
74 ; AVX512VBMIVL-LABEL: shuffle_v64i8_to_v32i8:
75 ; AVX512VBMIVL: # %bb.0:
76 ; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
77 ; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62]
78 ; AVX512VBMIVL-NEXT: vpermi2b 32(%rdi), %ymm0, %ymm1
79 ; AVX512VBMIVL-NEXT: vmovdqa %ymm1, (%rsi)
80 ; AVX512VBMIVL-NEXT: vzeroupper
81 ; AVX512VBMIVL-NEXT: retq
82 %vec = load <64 x i8>, <64 x i8>* %L
; The mask selects every second byte of the single input (second operand is undef).
83 %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62>
84 store <32 x i8> %strided.vec, <32 x i8>* %S
; Truncating counterpart of @shuffle_v64i8_to_v32i8: the <64 x i8> loaded from
; %L is bitcast to <32 x i16>, each element is truncated to i8, and the
; <32 x i8> result is stored to %S.
; The AVX512F and AVX512VL checks expect two vpmovzxwd/vpmovdb pairs joined by
; vinserti128; the remaining configurations expect one vpmovwb store.
88 define void @trunc_v32i16_to_v32i8(<64 x i8>* %L, <32 x i8>* %S) nounwind {
89 ; AVX512F-LABEL: trunc_v32i16_to_v32i8:
91 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
92 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
93 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
94 ; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
95 ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
96 ; AVX512F-NEXT: vmovdqa %ymm0, (%rsi)
97 ; AVX512F-NEXT: vzeroupper
100 ; AVX512VL-LABEL: trunc_v32i16_to_v32i8:
102 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
103 ; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
104 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
105 ; AVX512VL-NEXT: vpmovdb %zmm1, %xmm1
106 ; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
107 ; AVX512VL-NEXT: vmovdqa %ymm0, (%rsi)
108 ; AVX512VL-NEXT: vzeroupper
109 ; AVX512VL-NEXT: retq
111 ; AVX512BW-LABEL: trunc_v32i16_to_v32i8:
113 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
114 ; AVX512BW-NEXT: vpmovwb %zmm0, (%rsi)
115 ; AVX512BW-NEXT: vzeroupper
116 ; AVX512BW-NEXT: retq
118 ; AVX512BWVL-LABEL: trunc_v32i16_to_v32i8:
119 ; AVX512BWVL: # %bb.0:
120 ; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0
121 ; AVX512BWVL-NEXT: vpmovwb %zmm0, (%rsi)
122 ; AVX512BWVL-NEXT: vzeroupper
123 ; AVX512BWVL-NEXT: retq
125 ; AVX512VBMI-LABEL: trunc_v32i16_to_v32i8:
126 ; AVX512VBMI: # %bb.0:
127 ; AVX512VBMI-NEXT: vmovdqa64 (%rdi), %zmm0
128 ; AVX512VBMI-NEXT: vpmovwb %zmm0, (%rsi)
129 ; AVX512VBMI-NEXT: vzeroupper
130 ; AVX512VBMI-NEXT: retq
132 ; AVX512VBMIVL-LABEL: trunc_v32i16_to_v32i8:
133 ; AVX512VBMIVL: # %bb.0:
134 ; AVX512VBMIVL-NEXT: vmovdqa64 (%rdi), %zmm0
135 ; AVX512VBMIVL-NEXT: vpmovwb %zmm0, (%rsi)
136 ; AVX512VBMIVL-NEXT: vzeroupper
137 ; AVX512VBMIVL-NEXT: retq
138 %vec = load <64 x i8>, <64 x i8>* %L
; Reinterpret the 64 bytes as 32 words, then drop the upper half of each word.
139 %bc = bitcast <64 x i8> %vec to <32 x i16>
140 %strided.vec = trunc <32 x i16> %bc to <32 x i8>
141 store <32 x i8> %strided.vec, <32 x i8>* %S
; Loads a <32 x i16> vector from %L, keeps the even-indexed elements
; (0, 2, ..., 30) with a shufflevector, and stores the <16 x i16> result to %S.
; The AVX512BWVL and AVX512VBMIVL checks expect a single vpermi2w; the other
; configurations shuffle each 256-bit half separately and then merge them.
145 define void @shuffle_v32i16_to_v16i16(<32 x i16>* %L, <16 x i16>* %S) nounwind {
146 ; AVX512F-LABEL: shuffle_v32i16_to_v16i16:
148 ; AVX512F-NEXT: vpshuflw {{.*#+}} ymm0 = mem[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
149 ; AVX512F-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
150 ; AVX512F-NEXT: vpshuflw {{.*#+}} ymm1 = mem[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
151 ; AVX512F-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
152 ; AVX512F-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,2],ymm0[0,2],ymm1[4,6],ymm0[4,6]
153 ; AVX512F-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
154 ; AVX512F-NEXT: vmovaps %ymm0, (%rsi)
155 ; AVX512F-NEXT: vzeroupper
158 ; AVX512VL-LABEL: shuffle_v32i16_to_v16i16:
160 ; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
161 ; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1
162 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,4,5,6,7,8,9,12,13,12,13,14,15,16,17,20,21,20,21,22,23,24,25,28,29,28,29,30,31]
163 ; AVX512VL-NEXT: vpshufb %ymm2, %ymm1, %ymm1
164 ; AVX512VL-NEXT: vpshufb %ymm2, %ymm0, %ymm0
165 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14]
166 ; AVX512VL-NEXT: vpermi2d %ymm1, %ymm0, %ymm2
167 ; AVX512VL-NEXT: vmovdqa %ymm2, (%rsi)
168 ; AVX512VL-NEXT: vzeroupper
169 ; AVX512VL-NEXT: retq
171 ; AVX512BW-LABEL: shuffle_v32i16_to_v16i16:
173 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
174 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm1
175 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,4,5,6,7,8,9,12,13,12,13,14,15,16,17,20,21,20,21,22,23,24,25,28,29,28,29,30,31]
176 ; AVX512BW-NEXT: vpshufb %ymm2, %ymm1, %ymm1
177 ; AVX512BW-NEXT: vpshufb %ymm2, %ymm0, %ymm0
178 ; AVX512BW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
179 ; AVX512BW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
180 ; AVX512BW-NEXT: vmovaps %ymm0, (%rsi)
181 ; AVX512BW-NEXT: vzeroupper
182 ; AVX512BW-NEXT: retq
184 ; AVX512BWVL-LABEL: shuffle_v32i16_to_v16i16:
185 ; AVX512BWVL: # %bb.0:
186 ; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
187 ; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
188 ; AVX512BWVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1
189 ; AVX512BWVL-NEXT: vmovdqa %ymm1, (%rsi)
190 ; AVX512BWVL-NEXT: vzeroupper
191 ; AVX512BWVL-NEXT: retq
193 ; AVX512VBMI-LABEL: shuffle_v32i16_to_v16i16:
194 ; AVX512VBMI: # %bb.0:
195 ; AVX512VBMI-NEXT: vmovdqa (%rdi), %ymm0
196 ; AVX512VBMI-NEXT: vmovdqa 32(%rdi), %ymm1
197 ; AVX512VBMI-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,4,5,6,7,8,9,12,13,12,13,14,15,16,17,20,21,20,21,22,23,24,25,28,29,28,29,30,31]
198 ; AVX512VBMI-NEXT: vpshufb %ymm2, %ymm1, %ymm1
199 ; AVX512VBMI-NEXT: vpshufb %ymm2, %ymm0, %ymm0
200 ; AVX512VBMI-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
201 ; AVX512VBMI-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
202 ; AVX512VBMI-NEXT: vmovaps %ymm0, (%rsi)
203 ; AVX512VBMI-NEXT: vzeroupper
204 ; AVX512VBMI-NEXT: retq
206 ; AVX512VBMIVL-LABEL: shuffle_v32i16_to_v16i16:
207 ; AVX512VBMIVL: # %bb.0:
208 ; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
209 ; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
210 ; AVX512VBMIVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1
211 ; AVX512VBMIVL-NEXT: vmovdqa %ymm1, (%rsi)
212 ; AVX512VBMIVL-NEXT: vzeroupper
213 ; AVX512VBMIVL-NEXT: retq
214 %vec = load <32 x i16>, <32 x i16>* %L
; The mask selects every second i16 element of the single input.
215 %strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
216 store <16 x i16> %strided.vec, <16 x i16>* %S
; Truncating counterpart of @shuffle_v32i16_to_v16i16: the <32 x i16> loaded
; from %L is bitcast to <16 x i32>, truncated to <16 x i16>, and stored to %S.
; All configurations share one set of checks here: a 512-bit load followed by
; a vpmovdw store.
220 define void @trunc_v16i32_to_v16i16(<32 x i16>* %L, <16 x i16>* %S) nounwind {
221 ; AVX512-LABEL: trunc_v16i32_to_v16i16:
223 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
224 ; AVX512-NEXT: vpmovdw %zmm0, (%rsi)
225 ; AVX512-NEXT: vzeroupper
227 %vec = load <32 x i16>, <32 x i16>* %L
228 %bc = bitcast <32 x i16> %vec to <16 x i32>
229 %strided.vec = trunc <16 x i32> %bc to <16 x i16>
230 store <16 x i16> %strided.vec, <16 x i16>* %S
; Loads a <16 x i32> vector from %L, keeps the even-indexed elements
; (0, 2, ..., 14) with a shufflevector, and stores the <8 x i32> result to %S.
; The three configurations with avx512vl expect a single vpermi2d; the others
; expect vshufps followed by vpermpd.
234 define void @shuffle_v16i32_to_v8i32(<16 x i32>* %L, <8 x i32>* %S) nounwind {
235 ; AVX512F-LABEL: shuffle_v16i32_to_v8i32:
237 ; AVX512F-NEXT: vmovaps (%rdi), %ymm0
238 ; AVX512F-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],mem[0,2],ymm0[4,6],mem[4,6]
239 ; AVX512F-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
240 ; AVX512F-NEXT: vmovaps %ymm0, (%rsi)
241 ; AVX512F-NEXT: vzeroupper
244 ; AVX512VL-LABEL: shuffle_v16i32_to_v8i32:
246 ; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
247 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,12,14]
248 ; AVX512VL-NEXT: vpermi2d 32(%rdi), %ymm0, %ymm1
249 ; AVX512VL-NEXT: vmovdqa %ymm1, (%rsi)
250 ; AVX512VL-NEXT: vzeroupper
251 ; AVX512VL-NEXT: retq
253 ; AVX512BW-LABEL: shuffle_v16i32_to_v8i32:
255 ; AVX512BW-NEXT: vmovaps (%rdi), %ymm0
256 ; AVX512BW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],mem[0,2],ymm0[4,6],mem[4,6]
257 ; AVX512BW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
258 ; AVX512BW-NEXT: vmovaps %ymm0, (%rsi)
259 ; AVX512BW-NEXT: vzeroupper
260 ; AVX512BW-NEXT: retq
262 ; AVX512BWVL-LABEL: shuffle_v16i32_to_v8i32:
263 ; AVX512BWVL: # %bb.0:
264 ; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
265 ; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,12,14]
266 ; AVX512BWVL-NEXT: vpermi2d 32(%rdi), %ymm0, %ymm1
267 ; AVX512BWVL-NEXT: vmovdqa %ymm1, (%rsi)
268 ; AVX512BWVL-NEXT: vzeroupper
269 ; AVX512BWVL-NEXT: retq
271 ; AVX512VBMI-LABEL: shuffle_v16i32_to_v8i32:
272 ; AVX512VBMI: # %bb.0:
273 ; AVX512VBMI-NEXT: vmovaps (%rdi), %ymm0
274 ; AVX512VBMI-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],mem[0,2],ymm0[4,6],mem[4,6]
275 ; AVX512VBMI-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
276 ; AVX512VBMI-NEXT: vmovaps %ymm0, (%rsi)
277 ; AVX512VBMI-NEXT: vzeroupper
278 ; AVX512VBMI-NEXT: retq
280 ; AVX512VBMIVL-LABEL: shuffle_v16i32_to_v8i32:
281 ; AVX512VBMIVL: # %bb.0:
282 ; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
283 ; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,12,14]
284 ; AVX512VBMIVL-NEXT: vpermi2d 32(%rdi), %ymm0, %ymm1
285 ; AVX512VBMIVL-NEXT: vmovdqa %ymm1, (%rsi)
286 ; AVX512VBMIVL-NEXT: vzeroupper
287 ; AVX512VBMIVL-NEXT: retq
288 %vec = load <16 x i32>, <16 x i32>* %L
; The mask selects every second i32 element of the single input.
289 %strided.vec = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
290 store <8 x i32> %strided.vec, <8 x i32>* %S
; Truncating counterpart of @shuffle_v16i32_to_v8i32: the <16 x i32> loaded
; from %L is bitcast to <8 x i64>, truncated to <8 x i32>, and stored to %S.
; All configurations share one set of checks here: a 512-bit load followed by
; a vpmovqd store.
294 define void @trunc_v8i64_to_v8i32(<16 x i32>* %L, <8 x i32>* %S) nounwind {
295 ; AVX512-LABEL: trunc_v8i64_to_v8i32:
297 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
298 ; AVX512-NEXT: vpmovqd %zmm0, (%rsi)
299 ; AVX512-NEXT: vzeroupper
301 %vec = load <16 x i32>, <16 x i32>* %L
302 %bc = bitcast <16 x i32> %vec to <8 x i64>
303 %strided.vec = trunc <8 x i64> %bc to <8 x i32>
304 store <8 x i32> %strided.vec, <8 x i32>* %S
; Loads a <64 x i8> vector from %L, keeps every fourth byte (0, 4, ..., 60)
; with a shufflevector, and stores the <16 x i8> result to %S.
; Only the AVX512VBMIVL checks expect a single vpermi2b; the other
; configurations shuffle the four 128-bit quarters with vpshufb and merge them
; with vpunpckldq and vpblendd.
308 define void @shuffle_v64i8_to_v16i8(<64 x i8>* %L, <16 x i8>* %S) nounwind {
309 ; AVX512F-LABEL: shuffle_v64i8_to_v16i8:
311 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
312 ; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
313 ; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm2
314 ; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm3
315 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
316 ; AVX512F-NEXT: vpshufb %xmm4, %xmm3, %xmm3
317 ; AVX512F-NEXT: vpshufb %xmm4, %xmm2, %xmm2
318 ; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
319 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
320 ; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1
321 ; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0
322 ; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
323 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
324 ; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
327 ; AVX512VL-LABEL: shuffle_v64i8_to_v16i8:
329 ; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
330 ; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
331 ; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2
332 ; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3
333 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
334 ; AVX512VL-NEXT: vpshufb %xmm4, %xmm3, %xmm3
335 ; AVX512VL-NEXT: vpshufb %xmm4, %xmm2, %xmm2
336 ; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
337 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
338 ; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
339 ; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
340 ; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
341 ; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
342 ; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
343 ; AVX512VL-NEXT: retq
345 ; AVX512BW-LABEL: shuffle_v64i8_to_v16i8:
347 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
348 ; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
349 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2
350 ; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm3
351 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
352 ; AVX512BW-NEXT: vpshufb %xmm4, %xmm3, %xmm3
353 ; AVX512BW-NEXT: vpshufb %xmm4, %xmm2, %xmm2
354 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
355 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
356 ; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
357 ; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
358 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
359 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
360 ; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
361 ; AVX512BW-NEXT: retq
363 ; AVX512BWVL-LABEL: shuffle_v64i8_to_v16i8:
364 ; AVX512BWVL: # %bb.0:
365 ; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
366 ; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1
367 ; AVX512BWVL-NEXT: vmovdqa 32(%rdi), %xmm2
368 ; AVX512BWVL-NEXT: vmovdqa 48(%rdi), %xmm3
369 ; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
370 ; AVX512BWVL-NEXT: vpshufb %xmm4, %xmm3, %xmm3
371 ; AVX512BWVL-NEXT: vpshufb %xmm4, %xmm2, %xmm2
372 ; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
373 ; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
374 ; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
375 ; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
376 ; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
377 ; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
378 ; AVX512BWVL-NEXT: vmovdqa %xmm0, (%rsi)
379 ; AVX512BWVL-NEXT: retq
381 ; AVX512VBMI-LABEL: shuffle_v64i8_to_v16i8:
382 ; AVX512VBMI: # %bb.0:
383 ; AVX512VBMI-NEXT: vmovdqa (%rdi), %xmm0
384 ; AVX512VBMI-NEXT: vmovdqa 16(%rdi), %xmm1
385 ; AVX512VBMI-NEXT: vmovdqa 32(%rdi), %xmm2
386 ; AVX512VBMI-NEXT: vmovdqa 48(%rdi), %xmm3
387 ; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
388 ; AVX512VBMI-NEXT: vpshufb %xmm4, %xmm3, %xmm3
389 ; AVX512VBMI-NEXT: vpshufb %xmm4, %xmm2, %xmm2
390 ; AVX512VBMI-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
391 ; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
392 ; AVX512VBMI-NEXT: vpshufb %xmm3, %xmm1, %xmm1
393 ; AVX512VBMI-NEXT: vpshufb %xmm3, %xmm0, %xmm0
394 ; AVX512VBMI-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
395 ; AVX512VBMI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
396 ; AVX512VBMI-NEXT: vmovdqa %xmm0, (%rsi)
397 ; AVX512VBMI-NEXT: retq
399 ; AVX512VBMIVL-LABEL: shuffle_v64i8_to_v16i8:
400 ; AVX512VBMIVL: # %bb.0:
401 ; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
402 ; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
403 ; AVX512VBMIVL-NEXT: vpermi2b 32(%rdi), %ymm0, %ymm1
404 ; AVX512VBMIVL-NEXT: vmovdqa %xmm1, (%rsi)
405 ; AVX512VBMIVL-NEXT: vzeroupper
406 ; AVX512VBMIVL-NEXT: retq
407 %vec = load <64 x i8>, <64 x i8>* %L
; The mask selects every fourth byte of the single input.
408 %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
409 store <16 x i8> %strided.vec, <16 x i8>* %S
; Truncating counterpart of @shuffle_v64i8_to_v16i8: the <64 x i8> loaded from
; %L is bitcast to <16 x i32>, truncated to <16 x i8>, and stored to %S.
; All configurations share one set of checks here: a 512-bit load followed by
; a vpmovdb store.
413 define void @trunc_v16i32_to_v16i8(<64 x i8>* %L, <16 x i8>* %S) nounwind {
414 ; AVX512-LABEL: trunc_v16i32_to_v16i8:
416 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
417 ; AVX512-NEXT: vpmovdb %zmm0, (%rsi)
418 ; AVX512-NEXT: vzeroupper
420 %vec = load <64 x i8>, <64 x i8>* %L
421 %bc = bitcast <64 x i8> %vec to <16 x i32>
422 %strided.vec = trunc <16 x i32> %bc to <16 x i8>
423 store <16 x i8> %strided.vec, <16 x i8>* %S
; Loads a <32 x i16> vector from %L, keeps every fourth element
; (0, 4, ..., 28) with a shufflevector, and stores the <8 x i16> result to %S.
; The AVX512BWVL and AVX512VBMIVL checks expect a single vpermi2w; the other
; configurations shuffle the four 128-bit quarters and merge them with
; vpunpckldq and vpblendd.
427 define void @shuffle_v32i16_to_v8i16(<32 x i16>* %L, <8 x i16>* %S) nounwind {
428 ; AVX512F-LABEL: shuffle_v32i16_to_v8i16:
430 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
431 ; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
432 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = mem[0,2,2,3]
433 ; AVX512F-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7]
434 ; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
435 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = mem[0,2,2,3]
436 ; AVX512F-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
437 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm2 = mem[0,2,2,3]
438 ; AVX512F-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
439 ; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
440 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
441 ; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
444 ; AVX512VL-LABEL: shuffle_v32i16_to_v8i16:
446 ; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
447 ; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
448 ; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2
449 ; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3
450 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,0,1,8,9,8,9,10,11,12,13,14,15]
451 ; AVX512VL-NEXT: vpshufb %xmm4, %xmm3, %xmm3
452 ; AVX512VL-NEXT: vpshufb %xmm4, %xmm2, %xmm2
453 ; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
454 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
455 ; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
456 ; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
457 ; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
458 ; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
459 ; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
460 ; AVX512VL-NEXT: retq
462 ; AVX512BW-LABEL: shuffle_v32i16_to_v8i16:
464 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
465 ; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
466 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2
467 ; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm3
468 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,0,1,8,9,8,9,10,11,12,13,14,15]
469 ; AVX512BW-NEXT: vpshufb %xmm4, %xmm3, %xmm3
470 ; AVX512BW-NEXT: vpshufb %xmm4, %xmm2, %xmm2
471 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
472 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
473 ; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
474 ; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
475 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
476 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
477 ; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
478 ; AVX512BW-NEXT: retq
480 ; AVX512BWVL-LABEL: shuffle_v32i16_to_v8i16:
481 ; AVX512BWVL: # %bb.0:
482 ; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
483 ; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,4,8,12,16,20,24,28,u,u,u,u,u,u,u,u>
484 ; AVX512BWVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1
485 ; AVX512BWVL-NEXT: vmovdqa %xmm1, (%rsi)
486 ; AVX512BWVL-NEXT: vzeroupper
487 ; AVX512BWVL-NEXT: retq
489 ; AVX512VBMI-LABEL: shuffle_v32i16_to_v8i16:
490 ; AVX512VBMI: # %bb.0:
491 ; AVX512VBMI-NEXT: vmovdqa (%rdi), %xmm0
492 ; AVX512VBMI-NEXT: vmovdqa 16(%rdi), %xmm1
493 ; AVX512VBMI-NEXT: vmovdqa 32(%rdi), %xmm2
494 ; AVX512VBMI-NEXT: vmovdqa 48(%rdi), %xmm3
495 ; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,0,1,8,9,8,9,10,11,12,13,14,15]
496 ; AVX512VBMI-NEXT: vpshufb %xmm4, %xmm3, %xmm3
497 ; AVX512VBMI-NEXT: vpshufb %xmm4, %xmm2, %xmm2
498 ; AVX512VBMI-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
499 ; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
500 ; AVX512VBMI-NEXT: vpshufb %xmm3, %xmm1, %xmm1
501 ; AVX512VBMI-NEXT: vpshufb %xmm3, %xmm0, %xmm0
502 ; AVX512VBMI-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
503 ; AVX512VBMI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
504 ; AVX512VBMI-NEXT: vmovdqa %xmm0, (%rsi)
505 ; AVX512VBMI-NEXT: retq
507 ; AVX512VBMIVL-LABEL: shuffle_v32i16_to_v8i16:
508 ; AVX512VBMIVL: # %bb.0:
509 ; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
510 ; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,4,8,12,16,20,24,28,u,u,u,u,u,u,u,u>
511 ; AVX512VBMIVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1
512 ; AVX512VBMIVL-NEXT: vmovdqa %xmm1, (%rsi)
513 ; AVX512VBMIVL-NEXT: vzeroupper
514 ; AVX512VBMIVL-NEXT: retq
515 %vec = load <32 x i16>, <32 x i16>* %L
; The mask selects every fourth i16 element of the single input.
516 %strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
517 store <8 x i16> %strided.vec, <8 x i16>* %S
; Truncating counterpart of @shuffle_v32i16_to_v8i16: the <32 x i16> loaded
; from %L is bitcast to <8 x i64>, truncated to <8 x i16>, and stored to %S.
; All configurations share one set of checks here: a 512-bit load followed by
; a vpmovqw store.
521 define void @trunc_v8i64_to_v8i16(<32 x i16>* %L, <8 x i16>* %S) nounwind {
522 ; AVX512-LABEL: trunc_v8i64_to_v8i16:
524 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
525 ; AVX512-NEXT: vpmovqw %zmm0, (%rsi)
526 ; AVX512-NEXT: vzeroupper
528 %vec = load <32 x i16>, <32 x i16>* %L
529 %bc = bitcast <32 x i16> %vec to <8 x i64>
530 %strided.vec = trunc <8 x i64> %bc to <8 x i16>
531 store <8 x i16> %strided.vec, <8 x i16>* %S
; Loads a <64 x i8> vector from %L, keeps every eighth byte (0, 8, ..., 56)
; with a shufflevector, and stores the <8 x i8> result to %S.
; Only the AVX512VBMIVL checks expect a single vpermi2b; the other
; configurations shuffle the four 128-bit quarters with vpshufb and merge them
; with vpunpcklwd and vpblendd. Every configuration stores 8 bytes via vmovq.
535 define void @shuffle_v64i8_to_v8i8(<64 x i8>* %L, <8 x i8>* %S) nounwind {
536 ; AVX512F-LABEL: shuffle_v64i8_to_v8i8:
538 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
539 ; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
540 ; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm2
541 ; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm3
542 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
543 ; AVX512F-NEXT: vpshufb %xmm4, %xmm3, %xmm3
544 ; AVX512F-NEXT: vpshufb %xmm4, %xmm2, %xmm2
545 ; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
546 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
547 ; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1
548 ; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0
549 ; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
550 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
551 ; AVX512F-NEXT: vmovq %xmm0, (%rsi)
554 ; AVX512VL-LABEL: shuffle_v64i8_to_v8i8:
556 ; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
557 ; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
558 ; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2
559 ; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3
560 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
561 ; AVX512VL-NEXT: vpshufb %xmm4, %xmm3, %xmm3
562 ; AVX512VL-NEXT: vpshufb %xmm4, %xmm2, %xmm2
563 ; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
564 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
565 ; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
566 ; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
567 ; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
568 ; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
569 ; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
570 ; AVX512VL-NEXT: retq
572 ; AVX512BW-LABEL: shuffle_v64i8_to_v8i8:
574 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
575 ; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
576 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2
577 ; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm3
578 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
579 ; AVX512BW-NEXT: vpshufb %xmm4, %xmm3, %xmm3
580 ; AVX512BW-NEXT: vpshufb %xmm4, %xmm2, %xmm2
581 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
582 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
583 ; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
584 ; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
585 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
586 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
587 ; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
588 ; AVX512BW-NEXT: retq
590 ; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8:
591 ; AVX512BWVL: # %bb.0:
592 ; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
593 ; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1
594 ; AVX512BWVL-NEXT: vmovdqa 32(%rdi), %xmm2
595 ; AVX512BWVL-NEXT: vmovdqa 48(%rdi), %xmm3
596 ; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
597 ; AVX512BWVL-NEXT: vpshufb %xmm4, %xmm3, %xmm3
598 ; AVX512BWVL-NEXT: vpshufb %xmm4, %xmm2, %xmm2
599 ; AVX512BWVL-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
600 ; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
601 ; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
602 ; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
603 ; AVX512BWVL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
604 ; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
605 ; AVX512BWVL-NEXT: vmovq %xmm0, (%rsi)
606 ; AVX512BWVL-NEXT: retq
608 ; AVX512VBMI-LABEL: shuffle_v64i8_to_v8i8:
609 ; AVX512VBMI: # %bb.0:
610 ; AVX512VBMI-NEXT: vmovdqa (%rdi), %xmm0
611 ; AVX512VBMI-NEXT: vmovdqa 16(%rdi), %xmm1
612 ; AVX512VBMI-NEXT: vmovdqa 32(%rdi), %xmm2
613 ; AVX512VBMI-NEXT: vmovdqa 48(%rdi), %xmm3
614 ; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
615 ; AVX512VBMI-NEXT: vpshufb %xmm4, %xmm3, %xmm3
616 ; AVX512VBMI-NEXT: vpshufb %xmm4, %xmm2, %xmm2
617 ; AVX512VBMI-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
618 ; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
619 ; AVX512VBMI-NEXT: vpshufb %xmm3, %xmm1, %xmm1
620 ; AVX512VBMI-NEXT: vpshufb %xmm3, %xmm0, %xmm0
621 ; AVX512VBMI-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
622 ; AVX512VBMI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
623 ; AVX512VBMI-NEXT: vmovq %xmm0, (%rsi)
624 ; AVX512VBMI-NEXT: retq
626 ; AVX512VBMIVL-LABEL: shuffle_v64i8_to_v8i8:
627 ; AVX512VBMIVL: # %bb.0:
628 ; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
629 ; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,8,16,24,32,40,48,56,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
630 ; AVX512VBMIVL-NEXT: vpermi2b 32(%rdi), %ymm0, %ymm1
631 ; AVX512VBMIVL-NEXT: vmovq %xmm1, (%rsi)
632 ; AVX512VBMIVL-NEXT: vzeroupper
633 ; AVX512VBMIVL-NEXT: retq
634 %vec = load <64 x i8>, <64 x i8>* %L
; The mask selects every eighth byte of the single input.
635 %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56>
636 store <8 x i8> %strided.vec, <8 x i8>* %S
; Truncating counterpart of @shuffle_v64i8_to_v8i8: the <64 x i8> loaded from
; %L is bitcast to <8 x i64>, truncated to <8 x i8>, and stored to %S.
; All configurations share one set of checks here: a 512-bit load followed by
; a vpmovqb store.
640 define void @trunc_v8i64_to_v8i8(<64 x i8>* %L, <8 x i8>* %S) nounwind {
641 ; AVX512-LABEL: trunc_v8i64_to_v8i8:
643 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
644 ; AVX512-NEXT: vpmovqb %zmm0, (%rsi)
645 ; AVX512-NEXT: vzeroupper
647 %vec = load <64 x i8>, <64 x i8>* %L
648 %bc = bitcast <64 x i8> %vec to <8 x i64>
649 %strided.vec = trunc <8 x i64> %bc to <8 x i8>
650 store <8 x i8> %strided.vec, <8 x i8>* %S
; Picks byte 1 of every 4-byte group of %x (indices 1, 5, 9, ..., 61) and
; returns the 16 selected bytes. Both shuffle operands are %x and every index is
; below 64, so only the first operand is actually referenced.
; Expected lowering: the +avx512vbmi,+avx512vl run uses a single vpermi2b with a
; constant index vector; every other run assembles the result from four 128-bit
; pieces with vpshufb, vpunpckldq and vpblendd.
; NOTE(review): the "trunc_shuffle" name suggests this uniform stride is meant to
; be recognisable as a truncation; the expectations below do not show such a
; lowering -- confirm whether that is still a known missed optimisation.
654 define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61(<64 x i8> %x) {
655 ; AVX512F-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
657 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
658 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
659 ; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2
660 ; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1
661 ; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
662 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2
663 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
664 ; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2
665 ; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0
666 ; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
667 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
668 ; AVX512F-NEXT: vzeroupper
671 ; AVX512VL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
673 ; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2
674 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
675 ; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
676 ; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
677 ; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
678 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2
679 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
680 ; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
681 ; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
682 ; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
683 ; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
684 ; AVX512VL-NEXT: vzeroupper
685 ; AVX512VL-NEXT: retq
687 ; AVX512BW-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
689 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
690 ; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2
691 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
692 ; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
693 ; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
694 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
695 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2
696 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
697 ; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
698 ; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
699 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
700 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
701 ; AVX512BW-NEXT: vzeroupper
702 ; AVX512BW-NEXT: retq
704 ; AVX512BWVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
705 ; AVX512BWVL: # %bb.0:
706 ; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
707 ; AVX512BWVL-NEXT: vextracti128 $1, %ymm1, %xmm2
708 ; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
709 ; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
710 ; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
711 ; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
712 ; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm2
713 ; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
714 ; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
715 ; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
716 ; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
717 ; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
718 ; AVX512BWVL-NEXT: vzeroupper
719 ; AVX512BWVL-NEXT: retq
721 ; AVX512VBMI-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
722 ; AVX512VBMI: # %bb.0:
723 ; AVX512VBMI-NEXT: vextracti64x4 $1, %zmm0, %ymm1
724 ; AVX512VBMI-NEXT: vextracti128 $1, %ymm1, %xmm2
725 ; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
726 ; AVX512VBMI-NEXT: vpshufb %xmm3, %xmm2, %xmm2
727 ; AVX512VBMI-NEXT: vpshufb %xmm3, %xmm1, %xmm1
728 ; AVX512VBMI-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
729 ; AVX512VBMI-NEXT: vextracti128 $1, %ymm0, %xmm2
730 ; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
731 ; AVX512VBMI-NEXT: vpshufb %xmm3, %xmm2, %xmm2
732 ; AVX512VBMI-NEXT: vpshufb %xmm3, %xmm0, %xmm0
733 ; AVX512VBMI-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
734 ; AVX512VBMI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
735 ; AVX512VBMI-NEXT: vzeroupper
736 ; AVX512VBMI-NEXT: retq
738 ; AVX512VBMIVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
739 ; AVX512VBMIVL: # %bb.0:
740 ; AVX512VBMIVL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
741 ; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} ymm1 = <1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
742 ; AVX512VBMIVL-NEXT: vpermi2b %ymm2, %ymm0, %ymm1
743 ; AVX512VBMIVL-NEXT: vmovdqa %xmm1, %xmm0
744 ; AVX512VBMIVL-NEXT: vzeroupper
745 ; AVX512VBMIVL-NEXT: retq
746 %res = shufflevector <64 x i8> %x, <64 x i8> %x, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
; Same selection as the ..._57_61 test except that the final index is 62 rather
; than 61, so the chosen bytes no longer follow a uniform stride of 4.
; The expectations reflect the odd index: one 128-bit piece gets its own vpshufb
; mask ending in 14 instead of 13, and the index vector used by the
; +avx512vbmi,+avx512vl run ends in 62.
; NOTE(review): presumably a guard against treating an almost-strided shuffle as
; a real truncation -- confirm intent.
750 define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62(<64 x i8> %x) {
751 ; AVX512F-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
753 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2
754 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
755 ; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2
756 ; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0
757 ; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
758 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
759 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
760 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
761 ; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
762 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
763 ; AVX512F-NEXT: vzeroupper
766 ; AVX512VL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
768 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2
769 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
770 ; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
771 ; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
772 ; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
773 ; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2
774 ; AVX512VL-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
775 ; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
776 ; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
777 ; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
778 ; AVX512VL-NEXT: vzeroupper
779 ; AVX512VL-NEXT: retq
781 ; AVX512BW-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
783 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
784 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
785 ; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
786 ; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm2
787 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
788 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm0
789 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2
790 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
791 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
792 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
793 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
794 ; AVX512BW-NEXT: vzeroupper
795 ; AVX512BW-NEXT: retq
797 ; AVX512BWVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
798 ; AVX512BWVL: # %bb.0:
799 ; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
800 ; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
801 ; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
802 ; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm2
803 ; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
804 ; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
805 ; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm2
806 ; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
807 ; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
808 ; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
809 ; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
810 ; AVX512BWVL-NEXT: vzeroupper
811 ; AVX512BWVL-NEXT: retq
813 ; AVX512VBMI-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
814 ; AVX512VBMI: # %bb.0:
815 ; AVX512VBMI-NEXT: vextracti128 $1, %ymm0, %xmm1
816 ; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
817 ; AVX512VBMI-NEXT: vpshufb %xmm2, %xmm1, %xmm1
818 ; AVX512VBMI-NEXT: vpshufb %xmm2, %xmm0, %xmm2
819 ; AVX512VBMI-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
820 ; AVX512VBMI-NEXT: vextracti64x4 $1, %zmm0, %ymm0
821 ; AVX512VBMI-NEXT: vextracti128 $1, %ymm0, %xmm2
822 ; AVX512VBMI-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
823 ; AVX512VBMI-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
824 ; AVX512VBMI-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
825 ; AVX512VBMI-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
826 ; AVX512VBMI-NEXT: vzeroupper
827 ; AVX512VBMI-NEXT: retq
829 ; AVX512VBMIVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
830 ; AVX512VBMIVL: # %bb.0:
831 ; AVX512VBMIVL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
832 ; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} ymm1 = <1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,62,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
833 ; AVX512VBMIVL-NEXT: vpermi2b %ymm2, %ymm0, %ymm1
834 ; AVX512VBMIVL-NEXT: vmovdqa %xmm1, %xmm0
835 ; AVX512VBMIVL-NEXT: vzeroupper
836 ; AVX512VBMIVL-NEXT: retq
837 %res = shufflevector <64 x i8> %x, <64 x i8> %x, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 62>
; Regression test named after PR34175. Loads <32 x i16> with only 2-byte
; alignment, keeps elements 0, 8, 16 and 24, and converts those unsigned 16-bit
; values to doubles.
; Expected lowering: the two runs that pair +avx512vl with +avx512bw or
; +avx512vbmi gather the four words with one vpermi2w; the remaining runs use
; vpunpcklwd, vpbroadcastd and vpblendd. Every run finishes with vpmovzxwd
; followed by vcvtdq2pd, and the explicit vector loads are unaligned (vmovdqu),
; consistent with the `align 2` on the IR load.
841 define <4 x double> @PR34175(<32 x i16>* %p) {
842 ; AVX512F-LABEL: PR34175:
844 ; AVX512F-NEXT: vmovdqu (%rdi), %xmm0
845 ; AVX512F-NEXT: vmovdqu 32(%rdi), %xmm1
846 ; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
847 ; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
848 ; AVX512F-NEXT: vpbroadcastd %xmm1, %xmm1
849 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
850 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
851 ; AVX512F-NEXT: vcvtdq2pd %xmm0, %ymm0
854 ; AVX512VL-LABEL: PR34175:
856 ; AVX512VL-NEXT: vmovdqu (%rdi), %xmm0
857 ; AVX512VL-NEXT: vmovdqu 32(%rdi), %xmm1
858 ; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
859 ; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
860 ; AVX512VL-NEXT: vpbroadcastd %xmm1, %xmm1
861 ; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
862 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
863 ; AVX512VL-NEXT: vcvtdq2pd %xmm0, %ymm0
864 ; AVX512VL-NEXT: retq
866 ; AVX512BW-LABEL: PR34175:
868 ; AVX512BW-NEXT: vmovdqu (%rdi), %xmm0
869 ; AVX512BW-NEXT: vmovdqu 32(%rdi), %xmm1
870 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
871 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
872 ; AVX512BW-NEXT: vpbroadcastd %xmm1, %xmm1
873 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
874 ; AVX512BW-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
875 ; AVX512BW-NEXT: vcvtdq2pd %xmm0, %ymm0
876 ; AVX512BW-NEXT: retq
878 ; AVX512BWVL-LABEL: PR34175:
879 ; AVX512BWVL: # %bb.0:
880 ; AVX512BWVL-NEXT: vmovdqu (%rdi), %ymm0
881 ; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,8,16,24,u,u,u,u,u,u,u,u,u,u,u,u>
882 ; AVX512BWVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1
883 ; AVX512BWVL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
884 ; AVX512BWVL-NEXT: vcvtdq2pd %xmm0, %ymm0
885 ; AVX512BWVL-NEXT: retq
887 ; AVX512VBMI-LABEL: PR34175:
888 ; AVX512VBMI: # %bb.0:
889 ; AVX512VBMI-NEXT: vmovdqu (%rdi), %xmm0
890 ; AVX512VBMI-NEXT: vmovdqu 32(%rdi), %xmm1
891 ; AVX512VBMI-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
892 ; AVX512VBMI-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
893 ; AVX512VBMI-NEXT: vpbroadcastd %xmm1, %xmm1
894 ; AVX512VBMI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
895 ; AVX512VBMI-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
896 ; AVX512VBMI-NEXT: vcvtdq2pd %xmm0, %ymm0
897 ; AVX512VBMI-NEXT: retq
899 ; AVX512VBMIVL-LABEL: PR34175:
900 ; AVX512VBMIVL: # %bb.0:
901 ; AVX512VBMIVL-NEXT: vmovdqu (%rdi), %ymm0
902 ; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,8,16,24,u,u,u,u,u,u,u,u,u,u,u,u>
903 ; AVX512VBMIVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1
904 ; AVX512VBMIVL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
905 ; AVX512VBMIVL-NEXT: vcvtdq2pd %xmm0, %ymm0
906 ; AVX512VBMIVL-NEXT: retq
907 %v = load <32 x i16>, <32 x i16>* %p, align 2
908 %shuf = shufflevector <32 x i16> %v, <32 x i16> undef, <4 x i32> <i32 0, i32 8, i32 16, i32 24>
909 %tofp = uitofp <4 x i16> %shuf to <4 x double>
910 ret <4 x double> %tofp
; Truncates <8 x i64> to <8 x i8> and widens the result to <16 x i8> by
; concatenating it with zeroinitializer (shuffle indices 0-7 take the truncated
; bytes, 8-15 take the zero vector), so the upper eight result bytes are zero.
; Every RUN configuration shares one set of expectations: vpmovqb into xmm0, then
; a vmovq that keeps the low 64 bits and zeroes the rest of the register.
913 define <16 x i8> @trunc_v8i64_to_v8i8_return_v16i8(<8 x i64> %vec) nounwind {
914 ; AVX512-LABEL: trunc_v8i64_to_v8i8_return_v16i8:
916 ; AVX512-NEXT: vpmovqb %zmm0, %xmm0
917 ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
918 ; AVX512-NEXT: vzeroupper
920 %truncated = trunc <8 x i64> %vec to <8 x i8>
921 %result = shufflevector <8 x i8> %truncated, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
922 ret <16 x i8> %result