; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VBMI
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VBMIVL

; Pairs of shufflevector:trunc functions with functional equivalence.
; Ideally, the shuffles should be lowered to code with the same quality as the truncates.
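
; v64i8 to v32i8: take the even bytes of a 64-byte load. Per the checks below,
; the shufflevector form needs vpshufb+blend (or vpermi2b with VBMI+VL), while
; the equivalent bitcast+trunc form becomes a single vpmovwb once AVX512BW is
; available.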
define void @shuffle_v64i8_to_v32i8(<64 x i8>* %L, <32 x i8>* %S) nounwind {
; AVX512F-LABEL: shuffle_v64i8_to_v32i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512F-NEXT: vmovdqa %ymm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v64i8_to_v32i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512VL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,5,7]
; AVX512VL-NEXT: vpermi2q %ymm1, %ymm0, %ymm2
; AVX512VL-NEXT: vmovdqa %ymm2, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v64i8_to_v32i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512BW-NEXT: vmovdqa %ymm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v64i8_to_v32i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,5,7]
; AVX512BWVL-NEXT: vpermi2q %ymm1, %ymm0, %ymm2
; AVX512BWVL-NEXT: vmovdqa %ymm2, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMI-LABEL: shuffle_v64i8_to_v32i8:
; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VBMI-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512VBMI-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
; AVX512VBMI-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
; AVX512VBMI-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX512VBMI-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512VBMI-NEXT: vmovdqa %ymm0, (%rsi)
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
;
; AVX512VBMIVL-LABEL: shuffle_v64i8_to_v32i8:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62]
; AVX512VBMIVL-NEXT: vpermi2b 32(%rdi), %ymm0, %ymm1
; AVX512VBMIVL-NEXT: vmovdqa %ymm1, (%rsi)
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62>
  store <32 x i8> %strided.vec, <32 x i8>* %S
  ret void
}

define void @trunc_v32i16_to_v32i8(<64 x i8>* %L, <32 x i8>* %S) nounwind {
; AVX512F-LABEL: trunc_v32i16_to_v32i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512F-NEXT: vpmovdb %zmm1, 16(%rsi)
; AVX512F-NEXT: vpmovdb %zmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_v32i16_to_v32i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512VL-NEXT: vpmovdb %zmm1, 16(%rsi)
; AVX512VL-NEXT: vpmovdb %zmm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_v32i16_to_v32i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_v32i16_to_v32i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BWVL-NEXT: vpmovwb %zmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMI-LABEL: trunc_v32i16_to_v32i8:
; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512VBMI-NEXT: vpmovwb %zmm0, (%rsi)
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
;
; AVX512VBMIVL-LABEL: trunc_v32i16_to_v32i8:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512VBMIVL-NEXT: vpmovwb %zmm0, (%rsi)
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %bc = bitcast <64 x i8> %vec to <32 x i16>
  %strided.vec = trunc <32 x i16> %bc to <32 x i8>
  store <32 x i8> %strided.vec, <32 x i8>* %S
  ret void
}
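
; v32i16 to v16i16: the even words of a 64-byte load, paired with the v16i32 to
; v16i16 truncation, which lowers to a single vpmovdw on every AVX512 subtarget.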
define void @shuffle_v32i16_to_v16i16(<32 x i16>* %L, <16 x i16>* %S) nounwind {
; AVX512F-LABEL: shuffle_v32i16_to_v16i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpshuflw {{.*#+}} ymm0 = mem[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
; AVX512F-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
; AVX512F-NEXT: vpshuflw {{.*#+}} ymm1 = mem[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
; AVX512F-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
; AVX512F-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,2],ymm0[0,2],ymm1[4,6],ymm0[4,6]
; AVX512F-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512F-NEXT: vmovaps %ymm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i16_to_v16i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,4,5,6,7,8,9,12,13,12,13,14,15,16,17,20,21,20,21,22,23,24,25,28,29,28,29,30,31]
; AVX512VL-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX512VL-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14]
; AVX512VL-NEXT: vpermi2d %ymm1, %ymm0, %ymm2
; AVX512VL-NEXT: vmovdqa %ymm2, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i16_to_v16i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,4,5,6,7,8,9,12,13,12,13,14,15,16,17,20,21,20,21,22,23,24,25,28,29,28,29,30,31]
; AVX512BW-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX512BW-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX512BW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
; AVX512BW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512BW-NEXT: vmovaps %ymm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i16_to_v16i16:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
; AVX512BWVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1
; AVX512BWVL-NEXT: vmovdqa %ymm1, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMI-LABEL: shuffle_v32i16_to_v16i16:
; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VBMI-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512VBMI-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,4,5,6,7,8,9,12,13,12,13,14,15,16,17,20,21,20,21,22,23,24,25,28,29,28,29,30,31]
; AVX512VBMI-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX512VBMI-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX512VBMI-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
; AVX512VBMI-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512VBMI-NEXT: vmovaps %ymm0, (%rsi)
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
;
; AVX512VBMIVL-LABEL: shuffle_v32i16_to_v16i16:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
; AVX512VBMIVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1
; AVX512VBMIVL-NEXT: vmovdqa %ymm1, (%rsi)
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %vec = load <32 x i16>, <32 x i16>* %L
  %strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
  store <16 x i16> %strided.vec, <16 x i16>* %S
  ret void
}

define void @trunc_v16i32_to_v16i16(<32 x i16>* %L, <16 x i16>* %S) nounwind {
; AVX512-LABEL: trunc_v16i32_to_v16i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT: vpmovdw %zmm0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %vec = load <32 x i16>, <32 x i16>* %L
  %bc = bitcast <32 x i16> %vec to <16 x i32>
  %strided.vec = trunc <16 x i32> %bc to <16 x i16>
  store <16 x i16> %strided.vec, <16 x i16>* %S
  ret void
}
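
; v16i32 to v8i32: the even dwords. The shuffle form uses vshufps+vpermpd (or
; vpermi2d with VL); the matching v8i64 to v8i32 truncation is a single vpmovqd.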
define void @shuffle_v16i32_to_v8i32(<16 x i32>* %L, <8 x i32>* %S) nounwind {
; AVX512F-LABEL: shuffle_v16i32_to_v8i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovaps (%rdi), %ymm0
; AVX512F-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],mem[0,2],ymm0[4,6],mem[4,6]
; AVX512F-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512F-NEXT: vmovaps %ymm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i32_to_v8i32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,12,14]
; AVX512VL-NEXT: vpermi2d 32(%rdi), %ymm0, %ymm1
; AVX512VL-NEXT: vmovdqa %ymm1, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v16i32_to_v8i32:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovaps (%rdi), %ymm0
; AVX512BW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],mem[0,2],ymm0[4,6],mem[4,6]
; AVX512BW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512BW-NEXT: vmovaps %ymm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v16i32_to_v8i32:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,12,14]
; AVX512BWVL-NEXT: vpermi2d 32(%rdi), %ymm0, %ymm1
; AVX512BWVL-NEXT: vmovdqa %ymm1, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMI-LABEL: shuffle_v16i32_to_v8i32:
; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: vmovaps (%rdi), %ymm0
; AVX512VBMI-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],mem[0,2],ymm0[4,6],mem[4,6]
; AVX512VBMI-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512VBMI-NEXT: vmovaps %ymm0, (%rsi)
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
;
; AVX512VBMIVL-LABEL: shuffle_v16i32_to_v8i32:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,12,14]
; AVX512VBMIVL-NEXT: vpermi2d 32(%rdi), %ymm0, %ymm1
; AVX512VBMIVL-NEXT: vmovdqa %ymm1, (%rsi)
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %vec = load <16 x i32>, <16 x i32>* %L
  %strided.vec = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  store <8 x i32> %strided.vec, <8 x i32>* %S
  ret void
}

define void @trunc_v8i64_to_v8i32(<16 x i32>* %L, <8 x i32>* %S) nounwind {
; AVX512-LABEL: trunc_v8i64_to_v8i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT: vpmovqd %zmm0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %vec = load <16 x i32>, <16 x i32>* %L
  %bc = bitcast <16 x i32> %vec to <8 x i64>
  %strided.vec = trunc <8 x i64> %bc to <8 x i32>
  store <8 x i32> %strided.vec, <8 x i32>* %S
  ret void
}
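
; v64i8 to v16i8: every fourth byte. Without VBMI+VL this takes four 16-byte
; loads and a vpshufb/vpunpckldq/vpblendd chain; the equivalent v16i32 to v16i8
; truncation is a single vpmovdb.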
define void @shuffle_v64i8_to_v16i8(<64 x i8>* %L, <16 x i8>* %S) nounwind {
; AVX512F-LABEL: shuffle_v64i8_to_v16i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm3
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
; AVX512F-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX512F-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v64i8_to_v16i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX512VL-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v64i8_to_v16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm3
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX512BW-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v64i8_to_v16i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512BWVL-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512BWVL-NEXT: vmovdqa 48(%rdi), %xmm3
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
; AVX512BWVL-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX512BWVL-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
; AVX512BWVL-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512BWVL-NEXT: retq
;
; AVX512VBMI-LABEL: shuffle_v64i8_to_v16i8:
; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: vmovdqa (%rdi), %xmm0
; AVX512VBMI-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512VBMI-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512VBMI-NEXT: vmovdqa 48(%rdi), %xmm3
; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
; AVX512VBMI-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX512VBMI-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX512VBMI-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VBMI-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512VBMI-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512VBMI-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512VBMI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
; AVX512VBMI-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512VBMI-NEXT: retq
;
; AVX512VBMIVL-LABEL: shuffle_v64i8_to_v16i8:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm0 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60]
; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm1
; AVX512VBMIVL-NEXT: vpermt2b 32(%rdi), %ymm0, %ymm1
; AVX512VBMIVL-NEXT: vmovdqa %xmm1, (%rsi)
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
  store <16 x i8> %strided.vec, <16 x i8>* %S
  ret void
}

define void @trunc_v16i32_to_v16i8(<64 x i8>* %L, <16 x i8>* %S) nounwind {
; AVX512-LABEL: trunc_v16i32_to_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT: vpmovdb %zmm0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %bc = bitcast <64 x i8> %vec to <16 x i32>
  %strided.vec = trunc <16 x i32> %bc to <16 x i8>
  store <16 x i8> %strided.vec, <16 x i8>* %S
  ret void
}
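
; v32i16 to v8i16: every fourth word, against the v8i64 to v8i16 truncation
; (a single vpmovqw).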
define void @shuffle_v32i16_to_v8i16(<32 x i16>* %L, <8 x i16>* %S) nounwind {
; AVX512F-LABEL: shuffle_v32i16_to_v8i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = mem[0,2,2,3]
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7]
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = mem[0,2,2,3]
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; AVX512F-NEXT: vpshufd {{.*#+}} xmm2 = mem[0,2,2,3]
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i16_to_v8i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,0,1,8,9,8,9,10,11,12,13,14,15]
; AVX512VL-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX512VL-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i16_to_v8i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm3
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,0,1,8,9,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX512BW-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i16_to_v8i16:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm0 = [0,4,8,12,16,20,24,28]
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm1
; AVX512BWVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1
; AVX512BWVL-NEXT: vmovdqa %xmm1, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMI-LABEL: shuffle_v32i16_to_v8i16:
; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: vmovdqa (%rdi), %xmm0
; AVX512VBMI-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512VBMI-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512VBMI-NEXT: vmovdqa 48(%rdi), %xmm3
; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,0,1,8,9,8,9,10,11,12,13,14,15]
; AVX512VBMI-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX512VBMI-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX512VBMI-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
; AVX512VBMI-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512VBMI-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512VBMI-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512VBMI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
; AVX512VBMI-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512VBMI-NEXT: retq
;
; AVX512VBMIVL-LABEL: shuffle_v32i16_to_v8i16:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm0 = [0,4,8,12,16,20,24,28]
; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm1
; AVX512VBMIVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1
; AVX512VBMIVL-NEXT: vmovdqa %xmm1, (%rsi)
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %vec = load <32 x i16>, <32 x i16>* %L
  %strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
  store <8 x i16> %strided.vec, <8 x i16>* %S
  ret void
}

define void @trunc_v8i64_to_v8i16(<32 x i16>* %L, <8 x i16>* %S) nounwind {
; AVX512-LABEL: trunc_v8i64_to_v8i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT: vpmovqw %zmm0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %vec = load <32 x i16>, <32 x i16>* %L
  %bc = bitcast <32 x i16> %vec to <8 x i64>
  %strided.vec = trunc <8 x i64> %bc to <8 x i16>
  store <8 x i16> %strided.vec, <8 x i16>* %S
  ret void
}
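
; v64i8 to v8i8: every eighth byte, against the v8i64 to v8i8 truncation
; (a single vpmovqb).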
define void @shuffle_v64i8_to_v8i8(<64 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX512F-LABEL: shuffle_v64i8_to_v8i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm3
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX512F-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
; AVX512F-NEXT: vmovq %xmm0, (%rsi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v64i8_to_v8i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512VL-NEXT: vpmovqb %ymm1, %xmm1
; AVX512VL-NEXT: vpmovqb %ymm0, %xmm0
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v64i8_to_v8i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm3
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX512BW-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512BWVL-NEXT: vpmovqb %ymm1, %xmm1
; AVX512BWVL-NEXT: vpmovqb %ymm0, %xmm0
; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BWVL-NEXT: vmovq %xmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMI-LABEL: shuffle_v64i8_to_v8i8:
; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: vmovdqa (%rdi), %xmm0
; AVX512VBMI-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512VBMI-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512VBMI-NEXT: vmovdqa 48(%rdi), %xmm3
; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VBMI-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX512VBMI-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX512VBMI-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VBMI-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512VBMI-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512VBMI-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512VBMI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
; AVX512VBMI-NEXT: vmovq %xmm0, (%rsi)
; AVX512VBMI-NEXT: retq
;
; AVX512VBMIVL-LABEL: shuffle_v64i8_to_v8i8:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VBMIVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4048780183313844224,4048780183313844224,4048780183313844224,4048780183313844224]
; AVX512VBMIVL-NEXT: vpermi2b 32(%rdi), %ymm0, %ymm1
; AVX512VBMIVL-NEXT: vmovq %xmm1, (%rsi)
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}

define void @trunc_v8i64_to_v8i8(<64 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX512-LABEL: trunc_v8i64_to_v8i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT: vpmovqb %zmm0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %bc = bitcast <64 x i8> %vec to <8 x i64>
  %strided.vec = trunc <8 x i64> %bc to <8 x i8>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}
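
; Shuffles of an in-register <64 x i8> (not a load) whose mask matches a
; truncation pattern. With VBMI this is a single vpermb.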
define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61(<64 x i8> %x) {
; AVX512F-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BWVL-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMI-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61]
; AVX512VBMI-NEXT: vpermb %zmm0, %zmm1, %zmm0
; AVX512VBMI-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
;
; AVX512VBMIVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm1 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61]
; AVX512VBMIVL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512VBMIVL-NEXT: vpermt2b %ymm2, %ymm1, %ymm0
; AVX512VBMIVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %res = shufflevector <64 x i8> %x, <64 x i8> %x, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
  ret <16 x i8> %res
}
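
; Same as above except the last index is 62 rather than 61, so the mask is not
; a pure truncation; VBMI still handles it with one vpermb.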
define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62(<64 x i8> %x) {
; AVX512F-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512VL-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm2
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm2
; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMI-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,62]
; AVX512VBMI-NEXT: vpermb %zmm0, %zmm1, %zmm0
; AVX512VBMI-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
;
; AVX512VBMIVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm1 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,62]
; AVX512VBMIVL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512VBMIVL-NEXT: vpermt2b %ymm2, %ymm1, %ymm0
; AVX512VBMIVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %res = shufflevector <64 x i8> %x, <64 x i8> %x, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 62>
  ret <16 x i8> %res
}
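
; PR34175: extract words 0, 8, 16 and 24 of a <32 x i16> load and convert them
; to double via uitofp.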
define <4 x double> @PR34175(<32 x i16>* %p) {
; AVX512F-LABEL: PR34175:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqu (%rdi), %xmm0
; AVX512F-NEXT: vmovdqu 32(%rdi), %xmm1
; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512F-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX512F-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: PR34175:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqu (%rdi), %xmm0
; AVX512VL-NEXT: vmovdqu 32(%rdi), %xmm1
; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,4,2,3]
; AVX512VL-NEXT: vpermi2d %xmm1, %xmm0, %xmm2
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; AVX512VL-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: PR34175:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqu (%rdi), %xmm0
; AVX512BW-NEXT: vmovdqu 32(%rdi), %xmm1
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512BW-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX512BW-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: PR34175:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqu (%rdi), %ymm0
; AVX512BWVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [6755468161056768,6755468161056768,6755468161056768,6755468161056768]
; AVX512BWVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1
; AVX512BWVL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX512BWVL-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX512BWVL-NEXT: retq
;
; AVX512VBMI-LABEL: PR34175:
; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: vmovdqu (%rdi), %xmm0
; AVX512VBMI-NEXT: vmovdqu 32(%rdi), %xmm1
; AVX512VBMI-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX512VBMI-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX512VBMI-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; AVX512VBMI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512VBMI-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX512VBMI-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX512VBMI-NEXT: retq
;
; AVX512VBMIVL-LABEL: PR34175:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqu (%rdi), %ymm0
; AVX512VBMIVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [6755468161056768,6755468161056768,6755468161056768,6755468161056768]
; AVX512VBMIVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1
; AVX512VBMIVL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX512VBMIVL-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX512VBMIVL-NEXT: retq
  %v = load <32 x i16>, <32 x i16>* %p, align 2
  %shuf = shufflevector <32 x i16> %v, <32 x i16> undef, <4 x i32> <i32 0, i32 8, i32 16, i32 24>
  %tofp = uitofp <4 x i16> %shuf to <4 x double>
  ret <4 x double> %tofp
}
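
; Truncate <8 x i64> to <8 x i8> and return it widened to <16 x i8> with a
; zeroed upper half (vpmovqb followed by vmovq).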
define <16 x i8> @trunc_v8i64_to_v8i8_return_v16i8(<8 x i64> %vec) nounwind {
; AVX512-LABEL: trunc_v8i64_to_v8i8_return_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovqb %zmm0, %xmm0
; AVX512-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %truncated = trunc <8 x i64> %vec to <8 x i8>
  %result = shufflevector <8 x i8> %truncated, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i8> %result
}