; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VBMI
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VBMIVL

; Pairs of shufflevector:trunc functions with functional equivalence.
; Ideally, the shuffles should be lowered to code with the same quality as the truncates.
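;
; As a rough illustration of such a pair (a sketch only, with made-up value
; names, not one of the autogenerated cases below): selecting the even bytes
; of a <64 x i8> value computes the same result as truncating it when it is
; reinterpreted as <32 x i16>, so both forms should get equally good lowering:
;   %even  = shufflevector <64 x i8> %v, <64 x i8> undef, <32 x i32> <i32 0, i32 2, i32 4, ...>
;   %wide  = bitcast <64 x i8> %v to <32 x i16>
;   %trunc = trunc <32 x i16> %wide to <32 x i8>   ; same bytes as %even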

define void @shuffle_v64i8_to_v32i8(<64 x i8>* %L, <32 x i8>* %S) nounwind {
; AVX512F-LABEL: shuffle_v64i8_to_v32i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512F-NEXT: vmovdqa %ymm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v64i8_to_v32i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512VL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,5,7]
; AVX512VL-NEXT: vpermi2q %ymm1, %ymm0, %ymm2
; AVX512VL-NEXT: vmovdqa %ymm2, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v64i8_to_v32i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512BW-NEXT: vmovdqa %ymm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v64i8_to_v32i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,5,7]
; AVX512BWVL-NEXT: vpermi2q %ymm1, %ymm0, %ymm2
; AVX512BWVL-NEXT: vmovdqa %ymm2, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMI-LABEL: shuffle_v64i8_to_v32i8:
; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VBMI-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512VBMI-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
; AVX512VBMI-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
; AVX512VBMI-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX512VBMI-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512VBMI-NEXT: vmovdqa %ymm0, (%rsi)
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
;
; AVX512VBMIVL-LABEL: shuffle_v64i8_to_v32i8:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62]
; AVX512VBMIVL-NEXT: vpermi2b 32(%rdi), %ymm0, %ymm1
; AVX512VBMIVL-NEXT: vmovdqa %ymm1, (%rsi)
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62>
  store <32 x i8> %strided.vec, <32 x i8>* %S
  ret void
}
define void @trunc_v32i16_to_v32i8(<64 x i8>* %L, <32 x i8>* %S) nounwind {
; AVX512F-LABEL: trunc_v32i16_to_v32i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512F-NEXT: vpmovdb %zmm1, 16(%rsi)
; AVX512F-NEXT: vpmovdb %zmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_v32i16_to_v32i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512VL-NEXT: vpmovdb %zmm1, 16(%rsi)
; AVX512VL-NEXT: vpmovdb %zmm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_v32i16_to_v32i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_v32i16_to_v32i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BWVL-NEXT: vpmovwb %zmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMI-LABEL: trunc_v32i16_to_v32i8:
; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512VBMI-NEXT: vpmovwb %zmm0, (%rsi)
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
;
; AVX512VBMIVL-LABEL: trunc_v32i16_to_v32i8:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512VBMIVL-NEXT: vpmovwb %zmm0, (%rsi)
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %bc = bitcast <64 x i8> %vec to <32 x i16>
  %strided.vec = trunc <32 x i16> %bc to <32 x i8>
  store <32 x i8> %strided.vec, <32 x i8>* %S
  ret void
}
define void @shuffle_v32i16_to_v16i16(<32 x i16>* %L, <16 x i16>* %S) nounwind {
; AVX512F-LABEL: shuffle_v32i16_to_v16i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpshuflw {{.*#+}} ymm0 = mem[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
; AVX512F-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
; AVX512F-NEXT: vpshuflw {{.*#+}} ymm1 = mem[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
; AVX512F-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
; AVX512F-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,2],ymm0[0,2],ymm1[4,6],ymm0[4,6]
; AVX512F-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512F-NEXT: vmovaps %ymm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i16_to_v16i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,4,5,6,7,8,9,12,13,12,13,14,15,16,17,20,21,20,21,22,23,24,25,28,29,28,29,30,31]
; AVX512VL-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX512VL-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14]
; AVX512VL-NEXT: vpermi2d %ymm1, %ymm0, %ymm2
; AVX512VL-NEXT: vmovdqa %ymm2, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i16_to_v16i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,4,5,6,7,8,9,12,13,12,13,14,15,16,17,20,21,20,21,22,23,24,25,28,29,28,29,30,31]
; AVX512BW-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX512BW-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX512BW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
; AVX512BW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512BW-NEXT: vmovaps %ymm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i16_to_v16i16:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
; AVX512BWVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1
; AVX512BWVL-NEXT: vmovdqa %ymm1, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMI-LABEL: shuffle_v32i16_to_v16i16:
; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VBMI-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512VBMI-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,4,5,6,7,8,9,12,13,12,13,14,15,16,17,20,21,20,21,22,23,24,25,28,29,28,29,30,31]
; AVX512VBMI-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX512VBMI-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX512VBMI-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
; AVX512VBMI-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512VBMI-NEXT: vmovaps %ymm0, (%rsi)
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
;
; AVX512VBMIVL-LABEL: shuffle_v32i16_to_v16i16:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
; AVX512VBMIVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1
; AVX512VBMIVL-NEXT: vmovdqa %ymm1, (%rsi)
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %vec = load <32 x i16>, <32 x i16>* %L
  %strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
  store <16 x i16> %strided.vec, <16 x i16>* %S
  ret void
}
define void @trunc_v16i32_to_v16i16(<32 x i16>* %L, <16 x i16>* %S) nounwind {
; AVX512-LABEL: trunc_v16i32_to_v16i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT: vpmovdw %zmm0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %vec = load <32 x i16>, <32 x i16>* %L
  %bc = bitcast <32 x i16> %vec to <16 x i32>
  %strided.vec = trunc <16 x i32> %bc to <16 x i16>
  store <16 x i16> %strided.vec, <16 x i16>* %S
  ret void
}
define void @shuffle_v16i32_to_v8i32(<16 x i32>* %L, <8 x i32>* %S) nounwind {
; AVX512F-LABEL: shuffle_v16i32_to_v8i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovaps (%rdi), %ymm0
; AVX512F-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],mem[0,2],ymm0[4,6],mem[4,6]
; AVX512F-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512F-NEXT: vmovaps %ymm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i32_to_v8i32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,12,14]
; AVX512VL-NEXT: vpermi2d 32(%rdi), %ymm0, %ymm1
; AVX512VL-NEXT: vmovdqa %ymm1, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v16i32_to_v8i32:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovaps (%rdi), %ymm0
; AVX512BW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],mem[0,2],ymm0[4,6],mem[4,6]
; AVX512BW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512BW-NEXT: vmovaps %ymm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v16i32_to_v8i32:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,12,14]
; AVX512BWVL-NEXT: vpermi2d 32(%rdi), %ymm0, %ymm1
; AVX512BWVL-NEXT: vmovdqa %ymm1, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMI-LABEL: shuffle_v16i32_to_v8i32:
; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: vmovaps (%rdi), %ymm0
; AVX512VBMI-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],mem[0,2],ymm0[4,6],mem[4,6]
; AVX512VBMI-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512VBMI-NEXT: vmovaps %ymm0, (%rsi)
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
;
; AVX512VBMIVL-LABEL: shuffle_v16i32_to_v8i32:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,12,14]
; AVX512VBMIVL-NEXT: vpermi2d 32(%rdi), %ymm0, %ymm1
; AVX512VBMIVL-NEXT: vmovdqa %ymm1, (%rsi)
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %vec = load <16 x i32>, <16 x i32>* %L
  %strided.vec = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  store <8 x i32> %strided.vec, <8 x i32>* %S
  ret void
}
define void @trunc_v8i64_to_v8i32(<16 x i32>* %L, <8 x i32>* %S) nounwind {
; AVX512-LABEL: trunc_v8i64_to_v8i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT: vpmovqd %zmm0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %vec = load <16 x i32>, <16 x i32>* %L
  %bc = bitcast <16 x i32> %vec to <8 x i64>
  %strided.vec = trunc <8 x i64> %bc to <8 x i32>
  store <8 x i32> %strided.vec, <8 x i32>* %S
  ret void
}
define void @shuffle_v64i8_to_v16i8(<64 x i8>* %L, <16 x i8>* %S) nounwind {
; AVX512F-LABEL: shuffle_v64i8_to_v16i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm3
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
; AVX512F-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX512F-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v64i8_to_v16i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX512VL-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v64i8_to_v16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm3
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX512BW-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v64i8_to_v16i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512BWVL-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512BWVL-NEXT: vmovdqa 48(%rdi), %xmm3
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
; AVX512BWVL-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX512BWVL-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
; AVX512BWVL-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512BWVL-NEXT: retq
;
; AVX512VBMI-LABEL: shuffle_v64i8_to_v16i8:
; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: vmovdqa (%rdi), %xmm0
; AVX512VBMI-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512VBMI-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512VBMI-NEXT: vmovdqa 48(%rdi), %xmm3
; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
; AVX512VBMI-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX512VBMI-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX512VBMI-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VBMI-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512VBMI-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512VBMI-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512VBMI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
; AVX512VBMI-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512VBMI-NEXT: retq
;
; AVX512VBMIVL-LABEL: shuffle_v64i8_to_v16i8:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm0 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60]
; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm1
; AVX512VBMIVL-NEXT: vpermt2b 32(%rdi), %ymm0, %ymm1
; AVX512VBMIVL-NEXT: vmovdqa %xmm1, (%rsi)
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
  store <16 x i8> %strided.vec, <16 x i8>* %S
  ret void
}
define void @trunc_v16i32_to_v16i8(<64 x i8>* %L, <16 x i8>* %S) nounwind {
; AVX512-LABEL: trunc_v16i32_to_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT: vpmovdb %zmm0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %bc = bitcast <64 x i8> %vec to <16 x i32>
  %strided.vec = trunc <16 x i32> %bc to <16 x i8>
  store <16 x i8> %strided.vec, <16 x i8>* %S
  ret void
}
define void @shuffle_v32i16_to_v8i16(<32 x i16>* %L, <8 x i16>* %S) nounwind {
; AVX512F-LABEL: shuffle_v32i16_to_v8i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = mem[0,2,2,3]
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7]
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = mem[0,2,2,3]
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; AVX512F-NEXT: vpshufd {{.*#+}} xmm2 = mem[0,2,2,3]
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i16_to_v8i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,0,1,8,9,8,9,10,11,12,13,14,15]
; AVX512VL-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX512VL-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i16_to_v8i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm3
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,0,1,8,9,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX512BW-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i16_to_v8i16:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm0 = [0,4,8,12,16,20,24,28]
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm1
; AVX512BWVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1
; AVX512BWVL-NEXT: vmovdqa %xmm1, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMI-LABEL: shuffle_v32i16_to_v8i16:
; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: vmovdqa (%rdi), %xmm0
; AVX512VBMI-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512VBMI-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512VBMI-NEXT: vmovdqa 48(%rdi), %xmm3
; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,0,1,8,9,8,9,10,11,12,13,14,15]
; AVX512VBMI-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX512VBMI-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX512VBMI-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
; AVX512VBMI-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512VBMI-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512VBMI-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512VBMI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
; AVX512VBMI-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512VBMI-NEXT: retq
;
; AVX512VBMIVL-LABEL: shuffle_v32i16_to_v8i16:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm0 = [0,4,8,12,16,20,24,28]
; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm1
; AVX512VBMIVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1
; AVX512VBMIVL-NEXT: vmovdqa %xmm1, (%rsi)
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %vec = load <32 x i16>, <32 x i16>* %L
  %strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
  store <8 x i16> %strided.vec, <8 x i16>* %S
  ret void
}
define void @trunc_v8i64_to_v8i16(<32 x i16>* %L, <8 x i16>* %S) nounwind {
; AVX512-LABEL: trunc_v8i64_to_v8i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT: vpmovqw %zmm0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %vec = load <32 x i16>, <32 x i16>* %L
  %bc = bitcast <32 x i16> %vec to <8 x i64>
  %strided.vec = trunc <8 x i64> %bc to <8 x i16>
  store <8 x i16> %strided.vec, <8 x i16>* %S
  ret void
}
define void @shuffle_v64i8_to_v8i8(<64 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX512F-LABEL: shuffle_v64i8_to_v8i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm3
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX512F-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
; AVX512F-NEXT: vmovq %xmm0, (%rsi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v64i8_to_v8i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX512VL-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v64i8_to_v8i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm3
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX512BW-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm0 = [0,4,8,12,16,20,24,28]
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm1
; AVX512BWVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1
; AVX512BWVL-NEXT: vpmovwb %xmm1, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMI-LABEL: shuffle_v64i8_to_v8i8:
; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: vmovdqa (%rdi), %xmm0
; AVX512VBMI-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512VBMI-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512VBMI-NEXT: vmovdqa 48(%rdi), %xmm3
; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VBMI-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX512VBMI-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX512VBMI-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VBMI-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512VBMI-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512VBMI-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512VBMI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
; AVX512VBMI-NEXT: vmovq %xmm0, (%rsi)
; AVX512VBMI-NEXT: retq
;
; AVX512VBMIVL-LABEL: shuffle_v64i8_to_v8i8:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm0 = [0,4,8,12,16,20,24,28]
; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm1
; AVX512VBMIVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1
; AVX512VBMIVL-NEXT: vpmovwb %xmm1, (%rsi)
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}
define void @trunc_v8i64_to_v8i8(<64 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX512-LABEL: trunc_v8i64_to_v8i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT: vpmovqb %zmm0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %bc = bitcast <64 x i8> %vec to <8 x i64>
  %strided.vec = trunc <8 x i64> %bc to <8 x i8>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}
define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61(<64 x i8> %x) {
; AVX512F-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BWVL-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMI-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61]
; AVX512VBMI-NEXT: vpermb %zmm0, %zmm1, %zmm0
; AVX512VBMI-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
;
; AVX512VBMIVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm1 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61]
; AVX512VBMIVL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512VBMIVL-NEXT: vpermt2b %ymm2, %ymm1, %ymm0
; AVX512VBMIVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %res = shufflevector <64 x i8> %x, <64 x i8> %x, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
  ret <16 x i8> %res
}
define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62(<64 x i8> %x) {
; AVX512F-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512VL-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm2
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm2
; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMI-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,62]
; AVX512VBMI-NEXT: vpermb %zmm0, %zmm1, %zmm0
; AVX512VBMI-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
;
; AVX512VBMIVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm1 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,62]
; AVX512VBMIVL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512VBMIVL-NEXT: vpermt2b %ymm2, %ymm1, %ymm0
; AVX512VBMIVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %res = shufflevector <64 x i8> %x, <64 x i8> %x, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 62>
  ret <16 x i8> %res
}
define <4 x double> @PR34175(<32 x i16>* %p) {
; AVX512F-LABEL: PR34175:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqu (%rdi), %xmm0
; AVX512F-NEXT: vmovdqu 32(%rdi), %xmm1
; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX512F-NEXT: vpbroadcastd %xmm1, %xmm1
; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512F-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX512F-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: PR34175:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqu (%rdi), %xmm0
; AVX512VL-NEXT: vmovdqu 32(%rdi), %xmm1
; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,4,2,3]
; AVX512VL-NEXT: vpermi2d %xmm1, %xmm0, %xmm2
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; AVX512VL-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: PR34175:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqu (%rdi), %xmm0
; AVX512BW-NEXT: vmovdqu 32(%rdi), %xmm1
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX512BW-NEXT: vpbroadcastd %xmm1, %xmm1
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512BW-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX512BW-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: PR34175:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqu (%rdi), %ymm0
; AVX512BWVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [6755468161056768,6755468161056768,6755468161056768,6755468161056768]
; AVX512BWVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1
; AVX512BWVL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX512BWVL-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX512BWVL-NEXT: retq
;
; AVX512VBMI-LABEL: PR34175:
; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: vmovdqu (%rdi), %xmm0
; AVX512VBMI-NEXT: vmovdqu 32(%rdi), %xmm1
; AVX512VBMI-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX512VBMI-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX512VBMI-NEXT: vpbroadcastd %xmm1, %xmm1
; AVX512VBMI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512VBMI-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX512VBMI-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX512VBMI-NEXT: retq
;
; AVX512VBMIVL-LABEL: PR34175:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqu (%rdi), %ymm0
; AVX512VBMIVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [6755468161056768,6755468161056768,6755468161056768,6755468161056768]
; AVX512VBMIVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1
; AVX512VBMIVL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX512VBMIVL-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX512VBMIVL-NEXT: retq
  %v = load <32 x i16>, <32 x i16>* %p, align 2
  %shuf = shufflevector <32 x i16> %v, <32 x i16> undef, <4 x i32> <i32 0, i32 8, i32 16, i32 24>
  %tofp = uitofp <4 x i16> %shuf to <4 x double>
  ret <4 x double> %tofp
}
define <16 x i8> @trunc_v8i64_to_v8i8_return_v16i8(<8 x i64> %vec) nounwind {
; AVX512-LABEL: trunc_v8i64_to_v8i8_return_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovqb %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %truncated = trunc <8 x i64> %vec to <8 x i8>
  %result = shufflevector <8 x i8> %truncated, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i8> %result
}