; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BWVL

; Pairs of shufflevector:trunc functions with functional equivalence.
; Ideally, the shuffles should be lowered to code with the same quality as the truncates.
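;
; For illustration, a sketch of the equivalence exercised by the first pair of
; functions below: on little-endian x86, selecting the even bytes of a <64 x i8>
; vector
;   %even = shufflevector <64 x i8> %v, <64 x i8> undef, <32 x i32> <i32 0, i32 2, ..., i32 62>
; yields the same <32 x i8> value as reinterpreting the vector as <32 x i16>
; and truncating each element:
;   %w = bitcast <64 x i8> %v to <32 x i16>
;   %t = trunc <32 x i16> %w to <32 x i8>
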
define void @shuffle_v64i8_to_v32i8(<64 x i8>* %L, <32 x i8>* %S) nounwind {
; AVX512F-LABEL: shuffle_v64i8_to_v32i8:
; AVX512F: # BB#0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512F-NEXT: vmovdqa %ymm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v64i8_to_v32i8:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512VL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512VL-NEXT: vmovdqa %ymm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v64i8_to_v32i8:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512BW-NEXT: vmovdqa %ymm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v64i8_to_v32i8:
; AVX512BWVL: # BB#0:
; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
; AVX512BWVL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX512BWVL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512BWVL-NEXT: vmovdqa %ymm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62>
  store <32 x i8> %strided.vec, <32 x i8>* %S
  ret void
}

define void @trunc_v32i16_to_v32i8(<64 x i8>* %L, <32 x i8>* %S) nounwind {
; AVX512F-LABEL: trunc_v32i16_to_v32i8:
; AVX512F: # BB#0:
; AVX512F-NEXT: vpmovsxwd (%rdi), %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vpmovsxwd 32(%rdi), %zmm1
; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_v32i16_to_v32i8:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vpmovsxwd (%rdi), %zmm0
; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
; AVX512VL-NEXT: vpmovsxwd 32(%rdi), %zmm1
; AVX512VL-NEXT: vpmovdb %zmm1, %xmm1
; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: vmovdqa %ymm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_v32i16_to_v32i8:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_v32i16_to_v32i8:
; AVX512BWVL: # BB#0:
; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BWVL-NEXT: vpmovwb %zmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %bc = bitcast <64 x i8> %vec to <32 x i16>
  %strided.vec = trunc <32 x i16> %bc to <32 x i8>
  store <32 x i8> %strided.vec, <32 x i8>* %S
  ret void
}

define void @shuffle_v32i16_to_v16i16(<32 x i16>* %L, <16 x i16>* %S) nounwind {
; AVX512F-LABEL: shuffle_v32i16_to_v16i16:
; AVX512F: # BB#0:
; AVX512F-NEXT: vpshuflw {{.*#+}} ymm0 = mem[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
; AVX512F-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
; AVX512F-NEXT: vpshuflw {{.*#+}} ymm1 = mem[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
; AVX512F-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
; AVX512F-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,2],ymm0[0,2],ymm1[4,6],ymm0[4,6]
; AVX512F-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512F-NEXT: vmovaps %ymm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i16_to_v16i16:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vpshuflw {{.*#+}} ymm0 = mem[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
; AVX512VL-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
; AVX512VL-NEXT: vpshuflw {{.*#+}} ymm1 = mem[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
; AVX512VL-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
; AVX512VL-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,2],ymm0[0,2],ymm1[4,6],ymm0[4,6]
; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512VL-NEXT: vmovaps %ymm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i16_to_v16i16:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
; AVX512BW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
; AVX512BW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
; AVX512BW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
; AVX512BW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
; AVX512BW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512BW-NEXT: vmovaps %ymm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i16_to_v16i16:
; AVX512BWVL: # BB#0:
; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,16,18,20,22,8,10,12,14,24,26,28,30]
; AVX512BWVL-NEXT: vpermi2w %ymm1, %ymm0, %ymm2
; AVX512BWVL-NEXT: vpermq {{.*#+}} ymm0 = ymm2[0,2,1,3]
; AVX512BWVL-NEXT: vmovdqa %ymm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %vec = load <32 x i16>, <32 x i16>* %L
  %strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
  store <16 x i16> %strided.vec, <16 x i16>* %S
  ret void
}

define void @trunc_v16i32_to_v16i16(<32 x i16>* %L, <16 x i16>* %S) nounwind {
; AVX512-LABEL: trunc_v16i32_to_v16i16:
; AVX512: # BB#0:
; AVX512-NEXT: vmovdqa32 (%rdi), %zmm0
; AVX512-NEXT: vpmovdw %zmm0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %vec = load <32 x i16>, <32 x i16>* %L
  %bc = bitcast <32 x i16> %vec to <16 x i32>
  %strided.vec = trunc <16 x i32> %bc to <16 x i16>
  store <16 x i16> %strided.vec, <16 x i16>* %S
  ret void
}

define void @shuffle_v16i32_to_v8i32(<16 x i32>* %L, <8 x i32>* %S) nounwind {
; AVX512-LABEL: shuffle_v16i32_to_v8i32:
; AVX512: # BB#0:
; AVX512-NEXT: vmovaps (%rdi), %zmm0
; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
; AVX512-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512-NEXT: vmovaps %ymm0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %vec = load <16 x i32>, <16 x i32>* %L
  %strided.vec = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  store <8 x i32> %strided.vec, <8 x i32>* %S
  ret void
}

define void @trunc_v8i64_to_v8i32(<16 x i32>* %L, <8 x i32>* %S) nounwind {
; AVX512-LABEL: trunc_v8i64_to_v8i32:
; AVX512: # BB#0:
; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT: vpmovqd %zmm0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %vec = load <16 x i32>, <16 x i32>* %L
  %bc = bitcast <16 x i32> %vec to <8 x i64>
  %strided.vec = trunc <8 x i64> %bc to <8 x i32>
  store <8 x i32> %strided.vec, <8 x i32>* %S
  ret void
}

define void @shuffle_v64i8_to_v16i8(<64 x i8>* %L, <16 x i8>* %S) nounwind {
; AVX512F-LABEL: shuffle_v64i8_to_v16i8:
; AVX512F: # BB#0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v64i8_to_v16i8:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v64i8_to_v16i8:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v64i8_to_v16i8:
; AVX512BWVL: # BB#0:
; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BWVL-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512BWVL-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
  store <16 x i8> %strided.vec, <16 x i8>* %S
  ret void
}

define void @trunc_v16i32_to_v16i8(<64 x i8>* %L, <16 x i8>* %S) nounwind {
; AVX512-LABEL: trunc_v16i32_to_v16i8:
; AVX512: # BB#0:
; AVX512-NEXT: vmovdqa32 (%rdi), %zmm0
; AVX512-NEXT: vpmovdb %zmm0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %bc = bitcast <64 x i8> %vec to <16 x i32>
  %strided.vec = trunc <16 x i32> %bc to <16 x i8>
  store <16 x i8> %strided.vec, <16 x i8>* %S
  ret void
}

define void @shuffle_v32i16_to_v8i16(<32 x i16>* %L, <8 x i16>* %S) nounwind {
; AVX512F-LABEL: shuffle_v32i16_to_v8i16:
; AVX512F: # BB#0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512F-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7]
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512F-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i16_to_v8i16:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7]
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i16_to_v8i16:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7]
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i16_to_v8i16:
; AVX512BWVL: # BB#0:
; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm2 = <0,4,8,12,16,20,24,28,u,u,u,u,u,u,u,u>
; AVX512BWVL-NEXT: vpermi2w %ymm1, %ymm0, %ymm2
; AVX512BWVL-NEXT: vmovdqa %xmm2, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %vec = load <32 x i16>, <32 x i16>* %L
  %strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
  store <8 x i16> %strided.vec, <8 x i16>* %S
  ret void
}

define void @trunc_v8i64_to_v8i16(<32 x i16>* %L, <8 x i16>* %S) nounwind {
; AVX512-LABEL: trunc_v8i64_to_v8i16:
; AVX512: # BB#0:
; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT: vpmovqw %zmm0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %vec = load <32 x i16>, <32 x i16>* %L
  %bc = bitcast <32 x i16> %vec to <8 x i64>
  %strided.vec = trunc <8 x i64> %bc to <8 x i16>
  store <8 x i16> %strided.vec, <8 x i16>* %S
  ret void
}

define void @shuffle_v64i8_to_v8i8(<64 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX512F-LABEL: shuffle_v64i8_to_v8i8:
; AVX512F: # BB#0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512F-NEXT: vmovq %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v64i8_to_v8i8:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v64i8_to_v8i8:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8:
; AVX512BWVL: # BB#0:
; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm2 = <0,4,8,12,16,20,24,28,u,u,u,u,u,u,u,u>
; AVX512BWVL-NEXT: vpermi2w %ymm1, %ymm0, %ymm2
; AVX512BWVL-NEXT: vpmovwb %xmm2, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}

define void @trunc_v8i64_to_v8i8(<64 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX512-LABEL: trunc_v8i64_to_v8i8:
; AVX512: # BB#0:
; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT: vpmovqb %zmm0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %bc = bitcast <64 x i8> %vec to <8 x i64>
  %strided.vec = trunc <8 x i64> %bc to <8 x i8>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}

define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61(<64 x i8> %x) {
; AVX512F-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
; AVX512F: # BB#0:
; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
; AVX512BWVL: # BB#0:
; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BWVL-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %res = shufflevector <64 x i8> %x, <64 x i8> %x, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
  ret <16 x i8> %res
}

define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62(<64 x i8> %x) {
; AVX512F-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512F: # BB#0:
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512VL-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm2
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512BWVL: # BB#0:
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm2
; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %res = shufflevector <64 x i8> %x, <64 x i8> %x, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 62>
  ret <16 x i8> %res
}

define <4 x double> @PR34175(<32 x i16>* %p) {
; AVX512F-LABEL: PR34175:
; AVX512F: # BB#0:
; AVX512F-NEXT: vmovdqu (%rdi), %ymm0
; AVX512F-NEXT: vmovdqu 32(%rdi), %ymm1
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX512F-NEXT: vpbroadcastd %xmm1, %xmm1
; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512F-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX512F-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: PR34175:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqu (%rdi), %ymm0
; AVX512VL-NEXT: vmovdqu 32(%rdi), %ymm1
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX512VL-NEXT: vpbroadcastd %xmm1, %xmm1
; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX512VL-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: PR34175:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vmovdqu64 (%rdi), %zmm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX512BW-NEXT: vpbroadcastd %xmm0, %xmm0
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX512BW-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX512BW-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: PR34175:
; AVX512BWVL: # BB#0:
; AVX512BWVL-NEXT: vmovdqu64 (%rdi), %zmm0
; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm2 = <0,8,16,24,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BWVL-NEXT: vpermi2w %ymm1, %ymm0, %ymm2
; AVX512BWVL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; AVX512BWVL-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX512BWVL-NEXT: retq
  %v = load <32 x i16>, <32 x i16>* %p, align 2
  %shuf = shufflevector <32 x i16> %v, <32 x i16> undef, <4 x i32> <i32 0, i32 8, i32 16, i32 24>
  %tofp = uitofp <4 x i16> %shuf to <4 x double>
  ret <4 x double> %tofp
}