1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VBMI
7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VBMIVL
10 ; Each shufflevector function below is paired with a functionally equivalent trunc function.
11 ; Ideally, the shuffles should be lowered to code with the same quality as the truncates.
; Loads a <64 x i8> vector from %L, keeps the 32 even-indexed bytes
; (elements 0, 2, ..., 62) with a shufflevector, and stores the resulting
; <32 x i8> to %S. This is the shuffle half of the pair completed by
; @trunc_v32i16_to_v32i8.
13 define void @shuffle_v64i8_to_v32i8(<64 x i8>* %L, <32 x i8>* %S) nounwind {
14 ; AVX512F-LABEL: shuffle_v64i8_to_v32i8:
16 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
17 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
18 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
19 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
20 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
21 ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
22 ; AVX512F-NEXT: vmovdqa %ymm0, (%rsi)
23 ; AVX512F-NEXT: vzeroupper
26 ; AVX512VL-LABEL: shuffle_v64i8_to_v32i8:
28 ; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
29 ; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1
30 ; AVX512VL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
31 ; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
32 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,5,7]
33 ; AVX512VL-NEXT: vpermi2q %ymm1, %ymm0, %ymm2
34 ; AVX512VL-NEXT: vmovdqa %ymm2, (%rsi)
35 ; AVX512VL-NEXT: vzeroupper
38 ; AVX512BW-LABEL: shuffle_v64i8_to_v32i8:
40 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
41 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm1
42 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
43 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
44 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
45 ; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
46 ; AVX512BW-NEXT: vmovdqa %ymm0, (%rsi)
47 ; AVX512BW-NEXT: vzeroupper
50 ; AVX512BWVL-LABEL: shuffle_v64i8_to_v32i8:
51 ; AVX512BWVL: # %bb.0:
52 ; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
53 ; AVX512BWVL-NEXT: vmovdqa 32(%rdi), %ymm1
54 ; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
55 ; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
56 ; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,5,7]
57 ; AVX512BWVL-NEXT: vpermi2q %ymm1, %ymm0, %ymm2
58 ; AVX512BWVL-NEXT: vmovdqa %ymm2, (%rsi)
59 ; AVX512BWVL-NEXT: vzeroupper
60 ; AVX512BWVL-NEXT: retq
62 ; AVX512VBMI-LABEL: shuffle_v64i8_to_v32i8:
63 ; AVX512VBMI: # %bb.0:
64 ; AVX512VBMI-NEXT: vmovdqa (%rdi), %ymm0
65 ; AVX512VBMI-NEXT: vmovdqa 32(%rdi), %ymm1
66 ; AVX512VBMI-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
67 ; AVX512VBMI-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
68 ; AVX512VBMI-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
69 ; AVX512VBMI-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
70 ; AVX512VBMI-NEXT: vmovdqa %ymm0, (%rsi)
71 ; AVX512VBMI-NEXT: vzeroupper
72 ; AVX512VBMI-NEXT: retq
74 ; AVX512VBMIVL-LABEL: shuffle_v64i8_to_v32i8:
75 ; AVX512VBMIVL: # %bb.0:
76 ; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
77 ; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62]
78 ; AVX512VBMIVL-NEXT: vpermi2b 32(%rdi), %ymm0, %ymm1
79 ; AVX512VBMIVL-NEXT: vmovdqa %ymm1, (%rsi)
80 ; AVX512VBMIVL-NEXT: vzeroupper
81 ; AVX512VBMIVL-NEXT: retq
82 %vec = load <64 x i8>, <64 x i8>* %L
83 %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62>
84 store <32 x i8> %strided.vec, <32 x i8>* %S
; Loads a <64 x i8> vector from %L, bitcasts it to <32 x i16>, truncates
; every element to i8, and stores the resulting <32 x i8> to %S.
; This is the trunc counterpart of @shuffle_v64i8_to_v32i8.
88 define void @trunc_v32i16_to_v32i8(<64 x i8>* %L, <32 x i8>* %S) nounwind {
89 ; AVX512F-LABEL: trunc_v32i16_to_v32i8:
91 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
92 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
93 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
94 ; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
95 ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
96 ; AVX512F-NEXT: vmovdqa %ymm0, (%rsi)
97 ; AVX512F-NEXT: vzeroupper
100 ; AVX512VL-LABEL: trunc_v32i16_to_v32i8:
102 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
103 ; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
104 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
105 ; AVX512VL-NEXT: vpmovdb %zmm1, %xmm1
106 ; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
107 ; AVX512VL-NEXT: vmovdqa %ymm0, (%rsi)
108 ; AVX512VL-NEXT: vzeroupper
109 ; AVX512VL-NEXT: retq
111 ; AVX512BW-LABEL: trunc_v32i16_to_v32i8:
113 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
114 ; AVX512BW-NEXT: vpmovwb %zmm0, (%rsi)
115 ; AVX512BW-NEXT: vzeroupper
116 ; AVX512BW-NEXT: retq
118 ; AVX512BWVL-LABEL: trunc_v32i16_to_v32i8:
119 ; AVX512BWVL: # %bb.0:
120 ; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0
121 ; AVX512BWVL-NEXT: vpmovwb %zmm0, (%rsi)
122 ; AVX512BWVL-NEXT: vzeroupper
123 ; AVX512BWVL-NEXT: retq
125 ; AVX512VBMI-LABEL: trunc_v32i16_to_v32i8:
126 ; AVX512VBMI: # %bb.0:
127 ; AVX512VBMI-NEXT: vmovdqa64 (%rdi), %zmm0
128 ; AVX512VBMI-NEXT: vpmovwb %zmm0, (%rsi)
129 ; AVX512VBMI-NEXT: vzeroupper
130 ; AVX512VBMI-NEXT: retq
132 ; AVX512VBMIVL-LABEL: trunc_v32i16_to_v32i8:
133 ; AVX512VBMIVL: # %bb.0:
134 ; AVX512VBMIVL-NEXT: vmovdqa64 (%rdi), %zmm0
135 ; AVX512VBMIVL-NEXT: vpmovwb %zmm0, (%rsi)
136 ; AVX512VBMIVL-NEXT: vzeroupper
137 ; AVX512VBMIVL-NEXT: retq
138 %vec = load <64 x i8>, <64 x i8>* %L
139 %bc = bitcast <64 x i8> %vec to <32 x i16>
140 %strided.vec = trunc <32 x i16> %bc to <32 x i8>
141 store <32 x i8> %strided.vec, <32 x i8>* %S
; Loads a <32 x i16> vector from %L, keeps the 16 even-indexed elements
; (0, 2, ..., 30) with a shufflevector, and stores the resulting
; <16 x i16> to %S. This is the shuffle half of the pair completed by
; @trunc_v16i32_to_v16i16.
145 define void @shuffle_v32i16_to_v16i16(<32 x i16>* %L, <16 x i16>* %S) nounwind {
146 ; AVX512F-LABEL: shuffle_v32i16_to_v16i16:
148 ; AVX512F-NEXT: vpshuflw {{.*#+}} ymm0 = mem[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
149 ; AVX512F-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
150 ; AVX512F-NEXT: vpshuflw {{.*#+}} ymm1 = mem[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
151 ; AVX512F-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
152 ; AVX512F-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,2],ymm0[0,2],ymm1[4,6],ymm0[4,6]
153 ; AVX512F-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
154 ; AVX512F-NEXT: vmovaps %ymm0, (%rsi)
155 ; AVX512F-NEXT: vzeroupper
158 ; AVX512VL-LABEL: shuffle_v32i16_to_v16i16:
160 ; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
161 ; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1
162 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,4,5,6,7,8,9,12,13,12,13,14,15,16,17,20,21,20,21,22,23,24,25,28,29,28,29,30,31]
163 ; AVX512VL-NEXT: vpshufb %ymm2, %ymm1, %ymm1
164 ; AVX512VL-NEXT: vpshufb %ymm2, %ymm0, %ymm0
165 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14]
166 ; AVX512VL-NEXT: vpermi2d %ymm1, %ymm0, %ymm2
167 ; AVX512VL-NEXT: vmovdqa %ymm2, (%rsi)
168 ; AVX512VL-NEXT: vzeroupper
169 ; AVX512VL-NEXT: retq
171 ; AVX512BW-LABEL: shuffle_v32i16_to_v16i16:
173 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
174 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm1
175 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,4,5,6,7,8,9,12,13,12,13,14,15,16,17,20,21,20,21,22,23,24,25,28,29,28,29,30,31]
176 ; AVX512BW-NEXT: vpshufb %ymm2, %ymm1, %ymm1
177 ; AVX512BW-NEXT: vpshufb %ymm2, %ymm0, %ymm0
178 ; AVX512BW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
179 ; AVX512BW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
180 ; AVX512BW-NEXT: vmovaps %ymm0, (%rsi)
181 ; AVX512BW-NEXT: vzeroupper
182 ; AVX512BW-NEXT: retq
184 ; AVX512BWVL-LABEL: shuffle_v32i16_to_v16i16:
185 ; AVX512BWVL: # %bb.0:
186 ; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
187 ; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
188 ; AVX512BWVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1
189 ; AVX512BWVL-NEXT: vmovdqa %ymm1, (%rsi)
190 ; AVX512BWVL-NEXT: vzeroupper
191 ; AVX512BWVL-NEXT: retq
193 ; AVX512VBMI-LABEL: shuffle_v32i16_to_v16i16:
194 ; AVX512VBMI: # %bb.0:
195 ; AVX512VBMI-NEXT: vmovdqa (%rdi), %ymm0
196 ; AVX512VBMI-NEXT: vmovdqa 32(%rdi), %ymm1
197 ; AVX512VBMI-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,4,5,6,7,8,9,12,13,12,13,14,15,16,17,20,21,20,21,22,23,24,25,28,29,28,29,30,31]
198 ; AVX512VBMI-NEXT: vpshufb %ymm2, %ymm1, %ymm1
199 ; AVX512VBMI-NEXT: vpshufb %ymm2, %ymm0, %ymm0
200 ; AVX512VBMI-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
201 ; AVX512VBMI-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
202 ; AVX512VBMI-NEXT: vmovaps %ymm0, (%rsi)
203 ; AVX512VBMI-NEXT: vzeroupper
204 ; AVX512VBMI-NEXT: retq
206 ; AVX512VBMIVL-LABEL: shuffle_v32i16_to_v16i16:
207 ; AVX512VBMIVL: # %bb.0:
208 ; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
209 ; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
210 ; AVX512VBMIVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1
211 ; AVX512VBMIVL-NEXT: vmovdqa %ymm1, (%rsi)
212 ; AVX512VBMIVL-NEXT: vzeroupper
213 ; AVX512VBMIVL-NEXT: retq
214 %vec = load <32 x i16>, <32 x i16>* %L
215 %strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
216 store <16 x i16> %strided.vec, <16 x i16>* %S
; Loads a <32 x i16> vector from %L, bitcasts it to <16 x i32>, truncates
; every element to i16, and stores the resulting <16 x i16> to %S.
; This is the trunc counterpart of @shuffle_v32i16_to_v16i16. All run
; configurations share one set of checks here, which expects a 512-bit load
; followed by a truncating vpmovdw store.
220 define void @trunc_v16i32_to_v16i16(<32 x i16>* %L, <16 x i16>* %S) nounwind {
221 ; AVX512-LABEL: trunc_v16i32_to_v16i16:
223 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
224 ; AVX512-NEXT: vpmovdw %zmm0, (%rsi)
225 ; AVX512-NEXT: vzeroupper
227 %vec = load <32 x i16>, <32 x i16>* %L
228 %bc = bitcast <32 x i16> %vec to <16 x i32>
229 %strided.vec = trunc <16 x i32> %bc to <16 x i16>
230 store <16 x i16> %strided.vec, <16 x i16>* %S
; Loads a <16 x i32> vector from %L, keeps the 8 even-indexed elements
; (0, 2, ..., 14) with a shufflevector, and stores the resulting
; <8 x i32> to %S. This is the shuffle half of the pair completed by
; @trunc_v8i64_to_v8i32.
234 define void @shuffle_v16i32_to_v8i32(<16 x i32>* %L, <8 x i32>* %S) nounwind {
235 ; AVX512F-LABEL: shuffle_v16i32_to_v8i32:
237 ; AVX512F-NEXT: vmovaps (%rdi), %ymm0
238 ; AVX512F-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],mem[0,2],ymm0[4,6],mem[4,6]
239 ; AVX512F-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
240 ; AVX512F-NEXT: vmovaps %ymm0, (%rsi)
241 ; AVX512F-NEXT: vzeroupper
244 ; AVX512VL-LABEL: shuffle_v16i32_to_v8i32:
246 ; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
247 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,12,14]
248 ; AVX512VL-NEXT: vpermi2d 32(%rdi), %ymm0, %ymm1
249 ; AVX512VL-NEXT: vmovdqa %ymm1, (%rsi)
250 ; AVX512VL-NEXT: vzeroupper
251 ; AVX512VL-NEXT: retq
253 ; AVX512BW-LABEL: shuffle_v16i32_to_v8i32:
255 ; AVX512BW-NEXT: vmovaps (%rdi), %ymm0
256 ; AVX512BW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],mem[0,2],ymm0[4,6],mem[4,6]
257 ; AVX512BW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
258 ; AVX512BW-NEXT: vmovaps %ymm0, (%rsi)
259 ; AVX512BW-NEXT: vzeroupper
260 ; AVX512BW-NEXT: retq
262 ; AVX512BWVL-LABEL: shuffle_v16i32_to_v8i32:
263 ; AVX512BWVL: # %bb.0:
264 ; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
265 ; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,12,14]
266 ; AVX512BWVL-NEXT: vpermi2d 32(%rdi), %ymm0, %ymm1
267 ; AVX512BWVL-NEXT: vmovdqa %ymm1, (%rsi)
268 ; AVX512BWVL-NEXT: vzeroupper
269 ; AVX512BWVL-NEXT: retq
271 ; AVX512VBMI-LABEL: shuffle_v16i32_to_v8i32:
272 ; AVX512VBMI: # %bb.0:
273 ; AVX512VBMI-NEXT: vmovaps (%rdi), %ymm0
274 ; AVX512VBMI-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],mem[0,2],ymm0[4,6],mem[4,6]
275 ; AVX512VBMI-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
276 ; AVX512VBMI-NEXT: vmovaps %ymm0, (%rsi)
277 ; AVX512VBMI-NEXT: vzeroupper
278 ; AVX512VBMI-NEXT: retq
280 ; AVX512VBMIVL-LABEL: shuffle_v16i32_to_v8i32:
281 ; AVX512VBMIVL: # %bb.0:
282 ; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
283 ; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,12,14]
284 ; AVX512VBMIVL-NEXT: vpermi2d 32(%rdi), %ymm0, %ymm1
285 ; AVX512VBMIVL-NEXT: vmovdqa %ymm1, (%rsi)
286 ; AVX512VBMIVL-NEXT: vzeroupper
287 ; AVX512VBMIVL-NEXT: retq
288 %vec = load <16 x i32>, <16 x i32>* %L
289 %strided.vec = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
290 store <8 x i32> %strided.vec, <8 x i32>* %S
; Loads a <16 x i32> vector from %L, bitcasts it to <8 x i64>, truncates
; every element to i32, and stores the resulting <8 x i32> to %S.
; This is the trunc counterpart of @shuffle_v16i32_to_v8i32. All run
; configurations share one set of checks here, which expects a 512-bit load
; followed by a truncating vpmovqd store.
294 define void @trunc_v8i64_to_v8i32(<16 x i32>* %L, <8 x i32>* %S) nounwind {
295 ; AVX512-LABEL: trunc_v8i64_to_v8i32:
297 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
298 ; AVX512-NEXT: vpmovqd %zmm0, (%rsi)
299 ; AVX512-NEXT: vzeroupper
301 %vec = load <16 x i32>, <16 x i32>* %L
302 %bc = bitcast <16 x i32> %vec to <8 x i64>
303 %strided.vec = trunc <8 x i64> %bc to <8 x i32>
304 store <8 x i32> %strided.vec, <8 x i32>* %S
; Loads a <64 x i8> vector from %L, keeps every fourth byte
; (elements 0, 4, ..., 60) with a shufflevector, and stores the resulting
; <16 x i8> to %S. This is the shuffle half of the pair completed by
; @trunc_v16i32_to_v16i8.
308 define void @shuffle_v64i8_to_v16i8(<64 x i8>* %L, <16 x i8>* %S) nounwind {
309 ; AVX512F-LABEL: shuffle_v64i8_to_v16i8:
311 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
312 ; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
313 ; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm2
314 ; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm3
315 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
316 ; AVX512F-NEXT: vpshufb %xmm4, %xmm3, %xmm3
317 ; AVX512F-NEXT: vpshufb %xmm4, %xmm2, %xmm2
318 ; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
319 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
320 ; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1
321 ; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0
322 ; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
323 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
324 ; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
327 ; AVX512VL-LABEL: shuffle_v64i8_to_v16i8:
329 ; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
330 ; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
331 ; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2
332 ; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3
333 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
334 ; AVX512VL-NEXT: vpshufb %xmm4, %xmm3, %xmm3
335 ; AVX512VL-NEXT: vpshufb %xmm4, %xmm2, %xmm2
336 ; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
337 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
338 ; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
339 ; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
340 ; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
341 ; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
342 ; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
343 ; AVX512VL-NEXT: retq
345 ; AVX512BW-LABEL: shuffle_v64i8_to_v16i8:
347 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
348 ; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
349 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2
350 ; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm3
351 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
352 ; AVX512BW-NEXT: vpshufb %xmm4, %xmm3, %xmm3
353 ; AVX512BW-NEXT: vpshufb %xmm4, %xmm2, %xmm2
354 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
355 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
356 ; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
357 ; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
358 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
359 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
360 ; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
361 ; AVX512BW-NEXT: retq
363 ; AVX512BWVL-LABEL: shuffle_v64i8_to_v16i8:
364 ; AVX512BWVL: # %bb.0:
365 ; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
366 ; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1
367 ; AVX512BWVL-NEXT: vmovdqa 32(%rdi), %xmm2
368 ; AVX512BWVL-NEXT: vmovdqa 48(%rdi), %xmm3
369 ; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
370 ; AVX512BWVL-NEXT: vpshufb %xmm4, %xmm3, %xmm3
371 ; AVX512BWVL-NEXT: vpshufb %xmm4, %xmm2, %xmm2
372 ; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
373 ; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
374 ; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
375 ; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
376 ; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
377 ; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
378 ; AVX512BWVL-NEXT: vmovdqa %xmm0, (%rsi)
379 ; AVX512BWVL-NEXT: retq
381 ; AVX512VBMI-LABEL: shuffle_v64i8_to_v16i8:
382 ; AVX512VBMI: # %bb.0:
383 ; AVX512VBMI-NEXT: vmovdqa (%rdi), %xmm0
384 ; AVX512VBMI-NEXT: vmovdqa 16(%rdi), %xmm1
385 ; AVX512VBMI-NEXT: vmovdqa 32(%rdi), %xmm2
386 ; AVX512VBMI-NEXT: vmovdqa 48(%rdi), %xmm3
387 ; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
388 ; AVX512VBMI-NEXT: vpshufb %xmm4, %xmm3, %xmm3
389 ; AVX512VBMI-NEXT: vpshufb %xmm4, %xmm2, %xmm2
390 ; AVX512VBMI-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
391 ; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
392 ; AVX512VBMI-NEXT: vpshufb %xmm3, %xmm1, %xmm1
393 ; AVX512VBMI-NEXT: vpshufb %xmm3, %xmm0, %xmm0
394 ; AVX512VBMI-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
395 ; AVX512VBMI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
396 ; AVX512VBMI-NEXT: vmovdqa %xmm0, (%rsi)
397 ; AVX512VBMI-NEXT: retq
399 ; AVX512VBMIVL-LABEL: shuffle_v64i8_to_v16i8:
400 ; AVX512VBMIVL: # %bb.0:
401 ; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
402 ; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
403 ; AVX512VBMIVL-NEXT: vpermi2b 32(%rdi), %ymm0, %ymm1
404 ; AVX512VBMIVL-NEXT: vmovdqa %xmm1, (%rsi)
405 ; AVX512VBMIVL-NEXT: vzeroupper
406 ; AVX512VBMIVL-NEXT: retq
407 %vec = load <64 x i8>, <64 x i8>* %L
408 %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
409 store <16 x i8> %strided.vec, <16 x i8>* %S
; Loads a <64 x i8> vector from %L, bitcasts it to <16 x i32>, truncates
; every element to i8, and stores the resulting <16 x i8> to %S.
; This is the trunc counterpart of @shuffle_v64i8_to_v16i8. All run
; configurations share one set of checks here, which expects a 512-bit load
; followed by a truncating vpmovdb store.
413 define void @trunc_v16i32_to_v16i8(<64 x i8>* %L, <16 x i8>* %S) nounwind {
414 ; AVX512-LABEL: trunc_v16i32_to_v16i8:
416 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
417 ; AVX512-NEXT: vpmovdb %zmm0, (%rsi)
418 ; AVX512-NEXT: vzeroupper
420 %vec = load <64 x i8>, <64 x i8>* %L
421 %bc = bitcast <64 x i8> %vec to <16 x i32>
422 %strided.vec = trunc <16 x i32> %bc to <16 x i8>
423 store <16 x i8> %strided.vec, <16 x i8>* %S
; Loads a <32 x i16> vector from %L, keeps every fourth element
; (0, 4, ..., 28) with a shufflevector, and stores the resulting
; <8 x i16> to %S. This is the shuffle half of the pair completed by
; @trunc_v8i64_to_v8i16.
427 define void @shuffle_v32i16_to_v8i16(<32 x i16>* %L, <8 x i16>* %S) nounwind {
428 ; AVX512F-LABEL: shuffle_v32i16_to_v8i16:
430 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
431 ; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
432 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = mem[0,2,2,3]
433 ; AVX512F-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7]
434 ; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
435 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = mem[0,2,2,3]
436 ; AVX512F-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
437 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm2 = mem[0,2,2,3]
438 ; AVX512F-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
439 ; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
440 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
441 ; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
444 ; AVX512VL-LABEL: shuffle_v32i16_to_v8i16:
446 ; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
447 ; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
448 ; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2
449 ; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3
450 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,0,1,8,9,8,9,10,11,12,13,14,15]
451 ; AVX512VL-NEXT: vpshufb %xmm4, %xmm3, %xmm3
452 ; AVX512VL-NEXT: vpshufb %xmm4, %xmm2, %xmm2
453 ; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
454 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
455 ; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
456 ; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
457 ; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
458 ; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
459 ; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
460 ; AVX512VL-NEXT: retq
462 ; AVX512BW-LABEL: shuffle_v32i16_to_v8i16:
464 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
465 ; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
466 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2
467 ; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm3
468 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,0,1,8,9,8,9,10,11,12,13,14,15]
469 ; AVX512BW-NEXT: vpshufb %xmm4, %xmm3, %xmm3
470 ; AVX512BW-NEXT: vpshufb %xmm4, %xmm2, %xmm2
471 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
472 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
473 ; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
474 ; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
475 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
476 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
477 ; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
478 ; AVX512BW-NEXT: retq
480 ; AVX512BWVL-LABEL: shuffle_v32i16_to_v8i16:
481 ; AVX512BWVL: # %bb.0:
482 ; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
483 ; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,4,8,12,16,20,24,28,u,u,u,u,u,u,u,u>
484 ; AVX512BWVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1
485 ; AVX512BWVL-NEXT: vmovdqa %xmm1, (%rsi)
486 ; AVX512BWVL-NEXT: vzeroupper
487 ; AVX512BWVL-NEXT: retq
489 ; AVX512VBMI-LABEL: shuffle_v32i16_to_v8i16:
490 ; AVX512VBMI: # %bb.0:
491 ; AVX512VBMI-NEXT: vmovdqa (%rdi), %xmm0
492 ; AVX512VBMI-NEXT: vmovdqa 16(%rdi), %xmm1
493 ; AVX512VBMI-NEXT: vmovdqa 32(%rdi), %xmm2
494 ; AVX512VBMI-NEXT: vmovdqa 48(%rdi), %xmm3
495 ; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,0,1,8,9,8,9,10,11,12,13,14,15]
496 ; AVX512VBMI-NEXT: vpshufb %xmm4, %xmm3, %xmm3
497 ; AVX512VBMI-NEXT: vpshufb %xmm4, %xmm2, %xmm2
498 ; AVX512VBMI-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
499 ; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
500 ; AVX512VBMI-NEXT: vpshufb %xmm3, %xmm1, %xmm1
501 ; AVX512VBMI-NEXT: vpshufb %xmm3, %xmm0, %xmm0
502 ; AVX512VBMI-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
503 ; AVX512VBMI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
504 ; AVX512VBMI-NEXT: vmovdqa %xmm0, (%rsi)
505 ; AVX512VBMI-NEXT: retq
507 ; AVX512VBMIVL-LABEL: shuffle_v32i16_to_v8i16:
508 ; AVX512VBMIVL: # %bb.0:
509 ; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
510 ; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,4,8,12,16,20,24,28,u,u,u,u,u,u,u,u>
511 ; AVX512VBMIVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1
512 ; AVX512VBMIVL-NEXT: vmovdqa %xmm1, (%rsi)
513 ; AVX512VBMIVL-NEXT: vzeroupper
514 ; AVX512VBMIVL-NEXT: retq
515 %vec = load <32 x i16>, <32 x i16>* %L
516 %strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
517 store <8 x i16> %strided.vec, <8 x i16>* %S
; Loads a <32 x i16> vector from %L, bitcasts it to <8 x i64>, truncates
; every element to i16, and stores the resulting <8 x i16> to %S.
; This is the trunc counterpart of @shuffle_v32i16_to_v8i16. All run
; configurations share one set of checks here, which expects a 512-bit load
; followed by a truncating vpmovqw store.
521 define void @trunc_v8i64_to_v8i16(<32 x i16>* %L, <8 x i16>* %S) nounwind {
522 ; AVX512-LABEL: trunc_v8i64_to_v8i16:
524 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
525 ; AVX512-NEXT: vpmovqw %zmm0, (%rsi)
526 ; AVX512-NEXT: vzeroupper
528 %vec = load <32 x i16>, <32 x i16>* %L
529 %bc = bitcast <32 x i16> %vec to <8 x i64>
530 %strided.vec = trunc <8 x i64> %bc to <8 x i16>
531 store <8 x i16> %strided.vec, <8 x i16>* %S
; Loads a <64 x i8> vector from %L, keeps every eighth byte
; (elements 0, 8, ..., 56) with a shufflevector, and stores the resulting
; <8 x i8> to %S. This is the shuffle half of the pair completed by
; @trunc_v8i64_to_v8i8.
535 define void @shuffle_v64i8_to_v8i8(<64 x i8>* %L, <8 x i8>* %S) nounwind {
536 ; AVX512F-LABEL: shuffle_v64i8_to_v8i8:
538 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
539 ; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
540 ; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm2
541 ; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm3
542 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
543 ; AVX512F-NEXT: vpshufb %xmm4, %xmm3, %xmm3
544 ; AVX512F-NEXT: vpshufb %xmm4, %xmm2, %xmm2
545 ; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
546 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
547 ; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1
548 ; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0
549 ; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
550 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
551 ; AVX512F-NEXT: vmovq %xmm0, (%rsi)
554 ; AVX512VL-LABEL: shuffle_v64i8_to_v8i8:
556 ; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
557 ; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
558 ; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2
559 ; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3
560 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
561 ; AVX512VL-NEXT: vpshufb %xmm4, %xmm3, %xmm3
562 ; AVX512VL-NEXT: vpshufb %xmm4, %xmm2, %xmm2
563 ; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
564 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
565 ; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
566 ; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
567 ; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
568 ; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
569 ; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
570 ; AVX512VL-NEXT: retq
572 ; AVX512BW-LABEL: shuffle_v64i8_to_v8i8:
574 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
575 ; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
576 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2
577 ; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm3
578 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
579 ; AVX512BW-NEXT: vpshufb %xmm4, %xmm3, %xmm3
580 ; AVX512BW-NEXT: vpshufb %xmm4, %xmm2, %xmm2
581 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
582 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
583 ; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
584 ; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
585 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
586 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
587 ; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
588 ; AVX512BW-NEXT: retq
590 ; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8:
591 ; AVX512BWVL: # %bb.0:
592 ; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
593 ; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,4,8,12,16,20,24,28,u,u,u,u,u,u,u,u>
594 ; AVX512BWVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1
595 ; AVX512BWVL-NEXT: vpmovwb %xmm1, (%rsi)
596 ; AVX512BWVL-NEXT: vzeroupper
597 ; AVX512BWVL-NEXT: retq
599 ; AVX512VBMI-LABEL: shuffle_v64i8_to_v8i8:
600 ; AVX512VBMI: # %bb.0:
601 ; AVX512VBMI-NEXT: vmovdqa (%rdi), %xmm0
602 ; AVX512VBMI-NEXT: vmovdqa 16(%rdi), %xmm1
603 ; AVX512VBMI-NEXT: vmovdqa 32(%rdi), %xmm2
604 ; AVX512VBMI-NEXT: vmovdqa 48(%rdi), %xmm3
605 ; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
606 ; AVX512VBMI-NEXT: vpshufb %xmm4, %xmm3, %xmm3
607 ; AVX512VBMI-NEXT: vpshufb %xmm4, %xmm2, %xmm2
608 ; AVX512VBMI-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
609 ; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
610 ; AVX512VBMI-NEXT: vpshufb %xmm3, %xmm1, %xmm1
611 ; AVX512VBMI-NEXT: vpshufb %xmm3, %xmm0, %xmm0
612 ; AVX512VBMI-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
613 ; AVX512VBMI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
614 ; AVX512VBMI-NEXT: vmovq %xmm0, (%rsi)
615 ; AVX512VBMI-NEXT: retq
617 ; AVX512VBMIVL-LABEL: shuffle_v64i8_to_v8i8:
618 ; AVX512VBMIVL: # %bb.0:
619 ; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
620 ; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,4,8,12,16,20,24,28,u,u,u,u,u,u,u,u>
621 ; AVX512VBMIVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1
622 ; AVX512VBMIVL-NEXT: vpmovwb %xmm1, (%rsi)
623 ; AVX512VBMIVL-NEXT: vzeroupper
624 ; AVX512VBMIVL-NEXT: retq
625 %vec = load <64 x i8>, <64 x i8>* %L
626 %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56>
627 store <8 x i8> %strided.vec, <8 x i8>* %S
; Loads a <64 x i8> vector from %L, bitcasts it to <8 x i64>, truncates
; every element to i8, and stores the resulting <8 x i8> to %S.
; This is the trunc counterpart of @shuffle_v64i8_to_v8i8. All run
; configurations share one set of checks here, which expects a 512-bit load
; followed by a truncating vpmovqb store.
631 define void @trunc_v8i64_to_v8i8(<64 x i8>* %L, <8 x i8>* %S) nounwind {
632 ; AVX512-LABEL: trunc_v8i64_to_v8i8:
634 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
635 ; AVX512-NEXT: vpmovqb %zmm0, (%rsi)
636 ; AVX512-NEXT: vzeroupper
638 %vec = load <64 x i8>, <64 x i8>* %L
639 %bc = bitcast <64 x i8> %vec to <8 x i64>
640 %strided.vec = trunc <8 x i64> %bc to <8 x i8>
641 store <8 x i8> %strided.vec, <8 x i8>* %S
; Selects bytes 1, 5, 9, ..., 61 of %x (a uniform stride of 4 starting at
; offset 1) to form the <16 x i8> value %res.
; Per the checks below, only the +avx512vbmi,+avx512vl configuration performs
; the whole selection with one vpermi2b; every other configuration assembles
; the result from per-128-bit-lane vpshufb, vpunpckldq and a final vpblendd.
; NOTE(review): index 4*k+1 is byte 1 of 32-bit element k, so this mask looks
; like a candidate for a shift-plus-truncate lowering; the current checks do
; not show one - confirm the intent before relying on it.
645 define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61(<64 x i8> %x) {
646 ; AVX512F-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
648 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
649 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
650 ; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2
651 ; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1
652 ; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
653 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2
654 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
655 ; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2
656 ; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0
657 ; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
658 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
659 ; AVX512F-NEXT: vzeroupper
662 ; AVX512VL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
664 ; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2
665 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
666 ; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
667 ; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
668 ; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
669 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2
670 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
671 ; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
672 ; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
673 ; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
674 ; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
675 ; AVX512VL-NEXT: vzeroupper
676 ; AVX512VL-NEXT: retq
678 ; AVX512BW-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
680 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
681 ; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2
682 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
683 ; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
684 ; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
685 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
686 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2
687 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
688 ; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
689 ; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
690 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
691 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
692 ; AVX512BW-NEXT: vzeroupper
693 ; AVX512BW-NEXT: retq
695 ; AVX512BWVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
696 ; AVX512BWVL: # %bb.0:
697 ; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
698 ; AVX512BWVL-NEXT: vextracti128 $1, %ymm1, %xmm2
699 ; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
700 ; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
701 ; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
702 ; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
703 ; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm2
704 ; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
705 ; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
706 ; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
707 ; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
708 ; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
709 ; AVX512BWVL-NEXT: vzeroupper
710 ; AVX512BWVL-NEXT: retq
712 ; AVX512VBMI-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
713 ; AVX512VBMI: # %bb.0:
714 ; AVX512VBMI-NEXT: vextracti64x4 $1, %zmm0, %ymm1
715 ; AVX512VBMI-NEXT: vextracti128 $1, %ymm1, %xmm2
716 ; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
717 ; AVX512VBMI-NEXT: vpshufb %xmm3, %xmm2, %xmm2
718 ; AVX512VBMI-NEXT: vpshufb %xmm3, %xmm1, %xmm1
719 ; AVX512VBMI-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
720 ; AVX512VBMI-NEXT: vextracti128 $1, %ymm0, %xmm2
721 ; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
722 ; AVX512VBMI-NEXT: vpshufb %xmm3, %xmm2, %xmm2
723 ; AVX512VBMI-NEXT: vpshufb %xmm3, %xmm0, %xmm0
724 ; AVX512VBMI-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
725 ; AVX512VBMI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
726 ; AVX512VBMI-NEXT: vzeroupper
727 ; AVX512VBMI-NEXT: retq
729 ; AVX512VBMIVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
730 ; AVX512VBMIVL: # %bb.0:
731 ; AVX512VBMIVL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
732 ; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} ymm1 = <1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
733 ; AVX512VBMIVL-NEXT: vpermi2b %ymm2, %ymm0, %ymm1
734 ; AVX512VBMIVL-NEXT: vmovdqa %xmm1, %xmm0
735 ; AVX512VBMIVL-NEXT: vzeroupper
736 ; AVX512VBMIVL-NEXT: retq
; Both shuffle operands are %x and every index is below 64, so only the first operand is read.
737 %res = shufflevector <64 x i8> %x, <64 x i8> %x, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
; Same selection as the ..._57_61 variant except that the final index is 62
; instead of 61, so the mask is no longer a uniform stride of 4.
; In the checks this shows up as a separate vpshufb mask ending in 14 for the
; topmost 128-bit lane (configurations without both VBMI and VL), or as a
; trailing 62 in the vpermi2b index vector (+avx512vbmi,+avx512vl).
; NOTE(review): presumably a negative test showing that the irregular mask must
; stay a generic shuffle - confirm against the commit that added it.
741 define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62(<64 x i8> %x) {
742 ; AVX512F-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
744 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2
745 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
746 ; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2
747 ; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0
748 ; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
749 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
750 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
751 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
752 ; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
753 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
754 ; AVX512F-NEXT: vzeroupper
757 ; AVX512VL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
759 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2
760 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
761 ; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
762 ; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
763 ; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
764 ; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2
765 ; AVX512VL-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
766 ; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
767 ; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
768 ; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
769 ; AVX512VL-NEXT: vzeroupper
770 ; AVX512VL-NEXT: retq
772 ; AVX512BW-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
774 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
775 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
776 ; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
777 ; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm2
778 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
779 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm0
780 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2
781 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
782 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
783 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
784 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
785 ; AVX512BW-NEXT: vzeroupper
786 ; AVX512BW-NEXT: retq
788 ; AVX512BWVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
789 ; AVX512BWVL: # %bb.0:
790 ; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
791 ; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
792 ; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
793 ; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm2
794 ; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
795 ; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
796 ; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm2
797 ; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
798 ; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
799 ; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
800 ; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
801 ; AVX512BWVL-NEXT: vzeroupper
802 ; AVX512BWVL-NEXT: retq
804 ; AVX512VBMI-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
805 ; AVX512VBMI: # %bb.0:
806 ; AVX512VBMI-NEXT: vextracti128 $1, %ymm0, %xmm1
807 ; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
808 ; AVX512VBMI-NEXT: vpshufb %xmm2, %xmm1, %xmm1
809 ; AVX512VBMI-NEXT: vpshufb %xmm2, %xmm0, %xmm2
810 ; AVX512VBMI-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
811 ; AVX512VBMI-NEXT: vextracti64x4 $1, %zmm0, %ymm0
812 ; AVX512VBMI-NEXT: vextracti128 $1, %ymm0, %xmm2
813 ; AVX512VBMI-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
814 ; AVX512VBMI-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
815 ; AVX512VBMI-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
816 ; AVX512VBMI-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
817 ; AVX512VBMI-NEXT: vzeroupper
818 ; AVX512VBMI-NEXT: retq
820 ; AVX512VBMIVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
821 ; AVX512VBMIVL: # %bb.0:
822 ; AVX512VBMIVL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
823 ; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} ymm1 = <1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,62,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
824 ; AVX512VBMIVL-NEXT: vpermi2b %ymm2, %ymm0, %ymm1
825 ; AVX512VBMIVL-NEXT: vmovdqa %xmm1, %xmm0
826 ; AVX512VBMIVL-NEXT: vzeroupper
827 ; AVX512VBMIVL-NEXT: retq
; Only the last index (62) departs from the 4*k+1 progression.
828 %res = shufflevector <64 x i8> %x, <64 x i8> %x, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 62>
; Loads a <32 x i16> from %p, picks elements 0, 8, 16 and 24, and converts the
; four unsigned 16-bit values to <4 x double>.
; NOTE(review): the name suggests a regression test for bug report PR34175;
; the bug itself is not described in this file - see the tracker for context.
; Per the checks below, the two configurations that combine BW (or VBMI) with
; VL gather the elements with a single vpermi2w, while the others use
; vpunpcklwd, vpbroadcastd and vpblendd. Every configuration then zero-extends
; with vpmovzxwd and converts with vcvtdq2pd.
832 define <4 x double> @PR34175(<32 x i16>* %p) {
833 ; AVX512F-LABEL: PR34175:
835 ; AVX512F-NEXT: vmovdqu (%rdi), %xmm0
836 ; AVX512F-NEXT: vmovdqu 32(%rdi), %xmm1
837 ; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
838 ; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
839 ; AVX512F-NEXT: vpbroadcastd %xmm1, %xmm1
840 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
841 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
842 ; AVX512F-NEXT: vcvtdq2pd %xmm0, %ymm0
845 ; AVX512VL-LABEL: PR34175:
847 ; AVX512VL-NEXT: vmovdqu (%rdi), %xmm0
848 ; AVX512VL-NEXT: vmovdqu 32(%rdi), %xmm1
849 ; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
850 ; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
851 ; AVX512VL-NEXT: vpbroadcastd %xmm1, %xmm1
852 ; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
853 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
854 ; AVX512VL-NEXT: vcvtdq2pd %xmm0, %ymm0
855 ; AVX512VL-NEXT: retq
857 ; AVX512BW-LABEL: PR34175:
859 ; AVX512BW-NEXT: vmovdqu (%rdi), %xmm0
860 ; AVX512BW-NEXT: vmovdqu 32(%rdi), %xmm1
861 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
862 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
863 ; AVX512BW-NEXT: vpbroadcastd %xmm1, %xmm1
864 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
865 ; AVX512BW-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
866 ; AVX512BW-NEXT: vcvtdq2pd %xmm0, %ymm0
867 ; AVX512BW-NEXT: retq
869 ; AVX512BWVL-LABEL: PR34175:
870 ; AVX512BWVL: # %bb.0:
871 ; AVX512BWVL-NEXT: vmovdqu (%rdi), %ymm0
872 ; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,8,16,24,u,u,u,u,u,u,u,u,u,u,u,u>
873 ; AVX512BWVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1
874 ; AVX512BWVL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
875 ; AVX512BWVL-NEXT: vcvtdq2pd %xmm0, %ymm0
876 ; AVX512BWVL-NEXT: retq
878 ; AVX512VBMI-LABEL: PR34175:
879 ; AVX512VBMI: # %bb.0:
880 ; AVX512VBMI-NEXT: vmovdqu (%rdi), %xmm0
881 ; AVX512VBMI-NEXT: vmovdqu 32(%rdi), %xmm1
882 ; AVX512VBMI-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
883 ; AVX512VBMI-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
884 ; AVX512VBMI-NEXT: vpbroadcastd %xmm1, %xmm1
885 ; AVX512VBMI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
886 ; AVX512VBMI-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
887 ; AVX512VBMI-NEXT: vcvtdq2pd %xmm0, %ymm0
888 ; AVX512VBMI-NEXT: retq
890 ; AVX512VBMIVL-LABEL: PR34175:
891 ; AVX512VBMIVL: # %bb.0:
892 ; AVX512VBMIVL-NEXT: vmovdqu (%rdi), %ymm0
893 ; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,8,16,24,u,u,u,u,u,u,u,u,u,u,u,u>
894 ; AVX512VBMIVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1
895 ; AVX512VBMIVL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
896 ; AVX512VBMIVL-NEXT: vcvtdq2pd %xmm0, %ymm0
897 ; AVX512VBMIVL-NEXT: retq
; The load is only 2-byte aligned, consistent with the unaligned vmovdqu loads in the checks.
898 %v = load <32 x i16>, <32 x i16>* %p, align 2
; Every 8th i16 element is kept; the second shuffle operand is undef and never indexed.
899 %shuf = shufflevector <32 x i16> %v, <32 x i16> undef, <4 x i32> <i32 0, i32 8, i32 16, i32 24>
900 %tofp = uitofp <4 x i16> %shuf to <4 x double>
901 ret <4 x double> %tofp
; Truncates each i64 element of %vec to i8 and returns the eight bytes in the
; low half of a <16 x i8>, with the upper eight bytes taken from a zero vector.
; The checks under the shared AVX512 prefix expect the truncate and the zero
; padding to be covered by one register-to-register vpmovqb, with no separate
; zeroing or blend instruction.
904 define <16 x i8> @trunc_v8i64_to_v8i8_return_v16i8(<8 x i64> %vec) nounwind {
905 ; AVX512-LABEL: trunc_v8i64_to_v8i8_return_v16i8:
907 ; AVX512-NEXT: vpmovqb %zmm0, %xmm0
908 ; AVX512-NEXT: vzeroupper
910 %truncated = trunc <8 x i64> %vec to <8 x i8>
; Indices 0-7 select the truncated bytes; indices 8-15 select from the zeroinitializer operand.
911 %result = shufflevector <8 x i8> %truncated, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
912 ret <16 x i8> %result