; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VBMI
; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VBMIVL

; Pairs of shufflevector:trunc functions with functional equivalence.
; Ideally, the shuffles should be lowered to code with the same quality as the truncates.
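
; As a minimal sketch of the pairing (illustrative only, not covered by the
; autogenerated assertions in this file): on little-endian x86, selecting the
; even-index elements with a shufflevector is equivalent to bitcasting the
; vector to elements of twice the width and truncating each element back.
; The two hypothetical helpers below show the pattern at a small size.

define <8 x i16> @illustrate_even_shuffle(<16 x i16> %v) nounwind {
  ; Keep elements 0,2,4,...,14 of %v.
  %s = shufflevector <16 x i16> %v, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  ret <8 x i16> %s
}

define <8 x i16> @illustrate_even_trunc(<16 x i16> %v) nounwind {
  ; Reinterpret as <8 x i32>; the low 16 bits of each i32 are the even i16 elements.
  %bc = bitcast <16 x i16> %v to <8 x i32>
  %t = trunc <8 x i32> %bc to <8 x i16>
  ret <8 x i16> %t
}
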
define void @shuffle_v64i8_to_v32i8(<64 x i8>* %L, <32 x i8>* %S) nounwind {
; AVX512F-LABEL: shuffle_v64i8_to_v32i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512F-NEXT: vmovdqa %ymm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v64i8_to_v32i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512VL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,5,7]
; AVX512VL-NEXT: vpermi2q %ymm1, %ymm0, %ymm2
; AVX512VL-NEXT: vmovdqa %ymm2, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v64i8_to_v32i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512BW-NEXT: vmovdqa %ymm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v64i8_to_v32i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,5,7]
; AVX512BWVL-NEXT: vpermi2q %ymm1, %ymm0, %ymm2
; AVX512BWVL-NEXT: vmovdqa %ymm2, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMI-LABEL: shuffle_v64i8_to_v32i8:
; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VBMI-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512VBMI-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
; AVX512VBMI-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
; AVX512VBMI-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX512VBMI-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512VBMI-NEXT: vmovdqa %ymm0, (%rsi)
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
;
; AVX512VBMIVL-LABEL: shuffle_v64i8_to_v32i8:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62]
; AVX512VBMIVL-NEXT: vpermi2b 32(%rdi), %ymm0, %ymm1
; AVX512VBMIVL-NEXT: vmovdqa %ymm1, (%rsi)
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62>
  store <32 x i8> %strided.vec, <32 x i8>* %S
  ret void
}

define void @trunc_v32i16_to_v32i8(<64 x i8>* %L, <32 x i8>* %S) nounwind {
; AVX512F-LABEL: trunc_v32i16_to_v32i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512F-NEXT: vpmovdb %zmm1, 16(%rsi)
; AVX512F-NEXT: vpmovdb %zmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_v32i16_to_v32i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512VL-NEXT: vpmovdb %zmm1, 16(%rsi)
; AVX512VL-NEXT: vpmovdb %zmm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_v32i16_to_v32i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_v32i16_to_v32i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BWVL-NEXT: vpmovwb %zmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMI-LABEL: trunc_v32i16_to_v32i8:
; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512VBMI-NEXT: vpmovwb %zmm0, (%rsi)
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
;
; AVX512VBMIVL-LABEL: trunc_v32i16_to_v32i8:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512VBMIVL-NEXT: vpmovwb %zmm0, (%rsi)
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %bc = bitcast <64 x i8> %vec to <32 x i16>
  %strided.vec = trunc <32 x i16> %bc to <32 x i8>
  store <32 x i8> %strided.vec, <32 x i8>* %S
  ret void
}

define void @shuffle_v32i16_to_v16i16(<32 x i16>* %L, <16 x i16>* %S) nounwind {
; AVX512F-LABEL: shuffle_v32i16_to_v16i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpshuflw {{.*#+}} ymm0 = mem[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
; AVX512F-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
; AVX512F-NEXT: vpshuflw {{.*#+}} ymm1 = mem[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
; AVX512F-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
; AVX512F-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,2],ymm0[0,2],ymm1[4,6],ymm0[4,6]
; AVX512F-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512F-NEXT: vmovaps %ymm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i16_to_v16i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,4,5,6,7,8,9,12,13,12,13,14,15,16,17,20,21,20,21,22,23,24,25,28,29,28,29,30,31]
; AVX512VL-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX512VL-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14]
; AVX512VL-NEXT: vpermi2d %ymm1, %ymm0, %ymm2
; AVX512VL-NEXT: vmovdqa %ymm2, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i16_to_v16i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,4,5,6,7,8,9,12,13,12,13,14,15,16,17,20,21,20,21,22,23,24,25,28,29,28,29,30,31]
; AVX512BW-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX512BW-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX512BW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
; AVX512BW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512BW-NEXT: vmovaps %ymm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i16_to_v16i16:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
; AVX512BWVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1
; AVX512BWVL-NEXT: vmovdqa %ymm1, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMI-LABEL: shuffle_v32i16_to_v16i16:
; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VBMI-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512VBMI-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,4,5,6,7,8,9,12,13,12,13,14,15,16,17,20,21,20,21,22,23,24,25,28,29,28,29,30,31]
; AVX512VBMI-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX512VBMI-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX512VBMI-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
; AVX512VBMI-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512VBMI-NEXT: vmovaps %ymm0, (%rsi)
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
;
; AVX512VBMIVL-LABEL: shuffle_v32i16_to_v16i16:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
; AVX512VBMIVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1
; AVX512VBMIVL-NEXT: vmovdqa %ymm1, (%rsi)
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %vec = load <32 x i16>, <32 x i16>* %L
  %strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
  store <16 x i16> %strided.vec, <16 x i16>* %S
  ret void
}

define void @trunc_v16i32_to_v16i16(<32 x i16>* %L, <16 x i16>* %S) nounwind {
; AVX512-LABEL: trunc_v16i32_to_v16i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT: vpmovdw %zmm0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %vec = load <32 x i16>, <32 x i16>* %L
  %bc = bitcast <32 x i16> %vec to <16 x i32>
  %strided.vec = trunc <16 x i32> %bc to <16 x i16>
  store <16 x i16> %strided.vec, <16 x i16>* %S
  ret void
}

define void @shuffle_v16i32_to_v8i32(<16 x i32>* %L, <8 x i32>* %S) nounwind {
; AVX512F-LABEL: shuffle_v16i32_to_v8i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovaps (%rdi), %ymm0
; AVX512F-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],mem[0,2],ymm0[4,6],mem[4,6]
; AVX512F-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512F-NEXT: vmovaps %ymm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i32_to_v8i32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,12,14]
; AVX512VL-NEXT: vpermi2d 32(%rdi), %ymm0, %ymm1
; AVX512VL-NEXT: vmovdqa %ymm1, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v16i32_to_v8i32:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovaps (%rdi), %ymm0
; AVX512BW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],mem[0,2],ymm0[4,6],mem[4,6]
; AVX512BW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512BW-NEXT: vmovaps %ymm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v16i32_to_v8i32:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,12,14]
; AVX512BWVL-NEXT: vpermi2d 32(%rdi), %ymm0, %ymm1
; AVX512BWVL-NEXT: vmovdqa %ymm1, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMI-LABEL: shuffle_v16i32_to_v8i32:
; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: vmovaps (%rdi), %ymm0
; AVX512VBMI-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],mem[0,2],ymm0[4,6],mem[4,6]
; AVX512VBMI-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512VBMI-NEXT: vmovaps %ymm0, (%rsi)
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
;
; AVX512VBMIVL-LABEL: shuffle_v16i32_to_v8i32:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,12,14]
; AVX512VBMIVL-NEXT: vpermi2d 32(%rdi), %ymm0, %ymm1
; AVX512VBMIVL-NEXT: vmovdqa %ymm1, (%rsi)
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %vec = load <16 x i32>, <16 x i32>* %L
  %strided.vec = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  store <8 x i32> %strided.vec, <8 x i32>* %S
  ret void
}

define void @trunc_v8i64_to_v8i32(<16 x i32>* %L, <8 x i32>* %S) nounwind {
; AVX512-LABEL: trunc_v8i64_to_v8i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT: vpmovqd %zmm0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %vec = load <16 x i32>, <16 x i32>* %L
  %bc = bitcast <16 x i32> %vec to <8 x i64>
  %strided.vec = trunc <8 x i64> %bc to <8 x i32>
  store <8 x i32> %strided.vec, <8 x i32>* %S
  ret void
}

define void @shuffle_v64i8_to_v16i8(<64 x i8>* %L, <16 x i8>* %S) nounwind {
; AVX512F-LABEL: shuffle_v64i8_to_v16i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm3
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
; AVX512F-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX512F-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v64i8_to_v16i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX512VL-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v64i8_to_v16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm3
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX512BW-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v64i8_to_v16i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512BWVL-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512BWVL-NEXT: vmovdqa 48(%rdi), %xmm3
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
; AVX512BWVL-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX512BWVL-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
; AVX512BWVL-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512BWVL-NEXT: retq
;
; AVX512VBMI-LABEL: shuffle_v64i8_to_v16i8:
; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: vmovdqa (%rdi), %xmm0
; AVX512VBMI-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512VBMI-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512VBMI-NEXT: vmovdqa 48(%rdi), %xmm3
; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
; AVX512VBMI-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX512VBMI-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX512VBMI-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VBMI-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512VBMI-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512VBMI-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512VBMI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
; AVX512VBMI-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512VBMI-NEXT: retq
;
; AVX512VBMIVL-LABEL: shuffle_v64i8_to_v16i8:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm0 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60]
; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm1
; AVX512VBMIVL-NEXT: vpermt2b 32(%rdi), %ymm0, %ymm1
; AVX512VBMIVL-NEXT: vmovdqa %xmm1, (%rsi)
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
  store <16 x i8> %strided.vec, <16 x i8>* %S
  ret void
}

define void @trunc_v16i32_to_v16i8(<64 x i8>* %L, <16 x i8>* %S) nounwind {
; AVX512-LABEL: trunc_v16i32_to_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT: vpmovdb %zmm0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %bc = bitcast <64 x i8> %vec to <16 x i32>
  %strided.vec = trunc <16 x i32> %bc to <16 x i8>
  store <16 x i8> %strided.vec, <16 x i8>* %S
  ret void
}

define void @shuffle_v32i16_to_v8i16(<32 x i16>* %L, <8 x i16>* %S) nounwind {
; AVX512F-LABEL: shuffle_v32i16_to_v8i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = mem[0,2,2,3]
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7]
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = mem[0,2,2,3]
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; AVX512F-NEXT: vpshufd {{.*#+}} xmm2 = mem[0,2,2,3]
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i16_to_v8i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,0,1,8,9,8,9,10,11,12,13,14,15]
; AVX512VL-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX512VL-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i16_to_v8i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm3
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,0,1,8,9,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX512BW-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i16_to_v8i16:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm0 = [0,4,8,12,16,20,24,28]
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm1
; AVX512BWVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1
; AVX512BWVL-NEXT: vmovdqa %xmm1, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMI-LABEL: shuffle_v32i16_to_v8i16:
; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: vmovdqa (%rdi), %xmm0
; AVX512VBMI-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512VBMI-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512VBMI-NEXT: vmovdqa 48(%rdi), %xmm3
; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,0,1,8,9,8,9,10,11,12,13,14,15]
; AVX512VBMI-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX512VBMI-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX512VBMI-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
; AVX512VBMI-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512VBMI-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512VBMI-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512VBMI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
; AVX512VBMI-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512VBMI-NEXT: retq
;
; AVX512VBMIVL-LABEL: shuffle_v32i16_to_v8i16:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm0 = [0,4,8,12,16,20,24,28]
; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm1
; AVX512VBMIVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1
; AVX512VBMIVL-NEXT: vmovdqa %xmm1, (%rsi)
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %vec = load <32 x i16>, <32 x i16>* %L
  %strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
  store <8 x i16> %strided.vec, <8 x i16>* %S
  ret void
}

define void @trunc_v8i64_to_v8i16(<32 x i16>* %L, <8 x i16>* %S) nounwind {
; AVX512-LABEL: trunc_v8i64_to_v8i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT: vpmovqw %zmm0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %vec = load <32 x i16>, <32 x i16>* %L
  %bc = bitcast <32 x i16> %vec to <8 x i64>
  %strided.vec = trunc <8 x i64> %bc to <8 x i16>
  store <8 x i16> %strided.vec, <8 x i16>* %S
  ret void
}

define void @shuffle_v64i8_to_v8i8(<64 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX512F-LABEL: shuffle_v64i8_to_v8i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm3
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX512F-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
; AVX512F-NEXT: vmovq %xmm0, (%rsi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v64i8_to_v8i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX512VL-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v64i8_to_v8i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm3
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX512BW-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512BWVL-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512BWVL-NEXT: vmovdqa 48(%rdi), %xmm3
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BWVL-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX512BWVL-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX512BWVL-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
; AVX512BWVL-NEXT: vmovq %xmm0, (%rsi)
; AVX512BWVL-NEXT: retq
;
; AVX512VBMI-LABEL: shuffle_v64i8_to_v8i8:
; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: vmovdqa (%rdi), %xmm0
; AVX512VBMI-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512VBMI-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512VBMI-NEXT: vmovdqa 48(%rdi), %xmm3
; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VBMI-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX512VBMI-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX512VBMI-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VBMI-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512VBMI-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512VBMI-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512VBMI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
; AVX512VBMI-NEXT: vmovq %xmm0, (%rsi)
; AVX512VBMI-NEXT: retq
;
; AVX512VBMIVL-LABEL: shuffle_v64i8_to_v8i8:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VBMIVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4048780183313844224,4048780183313844224,4048780183313844224,4048780183313844224]
; AVX512VBMIVL-NEXT: vpermi2b 32(%rdi), %ymm0, %ymm1
; AVX512VBMIVL-NEXT: vmovq %xmm1, (%rsi)
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}

define void @trunc_v8i64_to_v8i8(<64 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX512-LABEL: trunc_v8i64_to_v8i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT: vpmovqb %zmm0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %bc = bitcast <64 x i8> %vec to <8 x i64>
  %strided.vec = trunc <8 x i64> %bc to <8 x i8>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}

define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61(<64 x i8> %x) {
; AVX512F-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BWVL-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMI-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61]
; AVX512VBMI-NEXT: vpermb %zmm0, %zmm1, %zmm0
; AVX512VBMI-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
;
; AVX512VBMIVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm1 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61]
; AVX512VBMIVL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512VBMIVL-NEXT: vpermt2b %ymm2, %ymm1, %ymm0
; AVX512VBMIVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %res = shufflevector <64 x i8> %x, <64 x i8> %x, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
  ret <16 x i8> %res
}

define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62(<64 x i8> %x) {
; AVX512F-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512VL-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm2
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm2
; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMI-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,62]
; AVX512VBMI-NEXT: vpermb %zmm0, %zmm1, %zmm0
; AVX512VBMI-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
;
; AVX512VBMIVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm1 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,62]
; AVX512VBMIVL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512VBMIVL-NEXT: vpermt2b %ymm2, %ymm1, %ymm0
; AVX512VBMIVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %res = shufflevector <64 x i8> %x, <64 x i8> %x, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 62>
  ret <16 x i8> %res
}

define <4 x double> @PR34175(<32 x i16>* %p) {
; AVX512F-LABEL: PR34175:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqu (%rdi), %xmm0
; AVX512F-NEXT: vmovdqu 32(%rdi), %xmm1
; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX512F-NEXT: vpbroadcastd %xmm1, %xmm1
; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512F-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX512F-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: PR34175:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqu (%rdi), %xmm0
; AVX512VL-NEXT: vmovdqu 32(%rdi), %xmm1
; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,4,2,3]
; AVX512VL-NEXT: vpermi2d %xmm1, %xmm0, %xmm2
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; AVX512VL-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: PR34175:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqu (%rdi), %xmm0
; AVX512BW-NEXT: vmovdqu 32(%rdi), %xmm1
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX512BW-NEXT: vpbroadcastd %xmm1, %xmm1
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512BW-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX512BW-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: PR34175:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqu (%rdi), %ymm0
; AVX512BWVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [6755468161056768,6755468161056768,6755468161056768,6755468161056768]
; AVX512BWVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1
; AVX512BWVL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX512BWVL-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX512BWVL-NEXT: retq
;
; AVX512VBMI-LABEL: PR34175:
; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: vmovdqu (%rdi), %xmm0
; AVX512VBMI-NEXT: vmovdqu 32(%rdi), %xmm1
; AVX512VBMI-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX512VBMI-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX512VBMI-NEXT: vpbroadcastd %xmm1, %xmm1
; AVX512VBMI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512VBMI-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX512VBMI-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX512VBMI-NEXT: retq
;
; AVX512VBMIVL-LABEL: PR34175:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqu (%rdi), %ymm0
; AVX512VBMIVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [6755468161056768,6755468161056768,6755468161056768,6755468161056768]
; AVX512VBMIVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1
; AVX512VBMIVL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX512VBMIVL-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX512VBMIVL-NEXT: retq
  %v = load <32 x i16>, <32 x i16>* %p, align 2
  %shuf = shufflevector <32 x i16> %v, <32 x i16> undef, <4 x i32> <i32 0, i32 8, i32 16, i32 24>
  %tofp = uitofp <4 x i16> %shuf to <4 x double>
  ret <4 x double> %tofp
}

define <16 x i8> @trunc_v8i64_to_v8i8_return_v16i8(<8 x i64> %vec) nounwind {
; AVX512-LABEL: trunc_v8i64_to_v8i8_return_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovqb %zmm0, %xmm0
; AVX512-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %truncated = trunc <8 x i64> %vec to <8 x i8>
  %result = shufflevector <8 x i8> %truncated, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i8> %result
}