; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL,AVX512VL-FAST-ALL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL,AVX512VL-FAST-PERLANE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VBMI
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VBMI
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VBMIVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VBMIVL

; Pairs of shufflevector:trunc functions with functional equivalence.
; Ideally, the shuffles should be lowered to code with the same quality as the truncates.
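;
; As a minimal illustration of that equivalence (not exercised by the tests
; below; %v, %bc, %t and %s are hypothetical values): on little-endian x86,
; truncating each i16 keeps its low byte, which is the even-indexed byte of
; the same data viewed as i8s, so
;   %bc = bitcast <8 x i8> %v to <4 x i16>
;   %t  = trunc <4 x i16> %bc to <4 x i8>
; computes the same bytes as
;   %s  = shufflevector <8 x i8> %v, <8 x i8> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>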

define void @shuffle_v64i8_to_v32i8(ptr %L, ptr %S) nounwind {
; AVX512F-LABEL: shuffle_v64i8_to_v32i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512F-NEXT: vmovdqa %ymm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-FAST-ALL-LABEL: shuffle_v64i8_to_v32i8:
; AVX512VL-FAST-ALL: # %bb.0:
; AVX512VL-FAST-ALL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-FAST-ALL-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512VL-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
; AVX512VL-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
; AVX512VL-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,5,7]
; AVX512VL-FAST-ALL-NEXT: vpermi2q %ymm1, %ymm0, %ymm2
; AVX512VL-FAST-ALL-NEXT: vmovdqa %ymm2, (%rsi)
; AVX512VL-FAST-ALL-NEXT: vzeroupper
; AVX512VL-FAST-ALL-NEXT: retq
;
; AVX512VL-FAST-PERLANE-LABEL: shuffle_v64i8_to_v32i8:
; AVX512VL-FAST-PERLANE: # %bb.0:
; AVX512VL-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512VL-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
; AVX512VL-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
; AVX512VL-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX512VL-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512VL-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rsi)
; AVX512VL-FAST-PERLANE-NEXT: vzeroupper
; AVX512VL-FAST-PERLANE-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v64i8_to_v32i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v64i8_to_v32i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BWVL-NEXT: vpmovwb %zmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMI-LABEL: shuffle_v64i8_to_v32i8:
; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512VBMI-NEXT: vpmovwb %zmm0, (%rsi)
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
;
; AVX512VBMIVL-LABEL: shuffle_v64i8_to_v32i8:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512VBMIVL-NEXT: vpmovwb %zmm0, (%rsi)
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %vec = load <64 x i8>, ptr %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62>
  store <32 x i8> %strided.vec, ptr %S
  ret void
}

define void @trunc_v32i16_to_v32i8(ptr %L, ptr %S) nounwind {
; AVX512F-LABEL: trunc_v32i16_to_v32i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512F-NEXT: vpmovdb %zmm1, 16(%rsi)
; AVX512F-NEXT: vpmovdb %zmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_v32i16_to_v32i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512VL-NEXT: vpmovdb %zmm1, 16(%rsi)
; AVX512VL-NEXT: vpmovdb %zmm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_v32i16_to_v32i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_v32i16_to_v32i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BWVL-NEXT: vpmovwb %zmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMI-LABEL: trunc_v32i16_to_v32i8:
; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512VBMI-NEXT: vpmovwb %zmm0, (%rsi)
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
;
; AVX512VBMIVL-LABEL: trunc_v32i16_to_v32i8:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512VBMIVL-NEXT: vpmovwb %zmm0, (%rsi)
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %vec = load <64 x i8>, ptr %L
  %bc = bitcast <64 x i8> %vec to <32 x i16>
  %strided.vec = trunc <32 x i16> %bc to <32 x i8>
  store <32 x i8> %strided.vec, ptr %S
  ret void
}

define void @shuffle_v32i16_to_v16i16(ptr %L, ptr %S) nounwind {
; AVX512-LABEL: shuffle_v32i16_to_v16i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT: vpmovdw %zmm0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %vec = load <32 x i16>, ptr %L
  %strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
  store <16 x i16> %strided.vec, ptr %S
  ret void
}

define void @trunc_v16i32_to_v16i16(ptr %L, ptr %S) nounwind {
; AVX512-LABEL: trunc_v16i32_to_v16i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT: vpmovdw %zmm0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %vec = load <32 x i16>, ptr %L
  %bc = bitcast <32 x i16> %vec to <16 x i32>
  %strided.vec = trunc <16 x i32> %bc to <16 x i16>
  store <16 x i16> %strided.vec, ptr %S
  ret void
}

define void @shuffle_v16i32_to_v8i32(ptr %L, ptr %S) nounwind {
; AVX512-LABEL: shuffle_v16i32_to_v8i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT: vpmovqd %zmm0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %vec = load <16 x i32>, ptr %L
  %strided.vec = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  store <8 x i32> %strided.vec, ptr %S
  ret void
}

define void @trunc_v8i64_to_v8i32(ptr %L, ptr %S) nounwind {
; AVX512-LABEL: trunc_v8i64_to_v8i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT: vpmovqd %zmm0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %vec = load <16 x i32>, ptr %L
  %bc = bitcast <16 x i32> %vec to <8 x i64>
  %strided.vec = trunc <8 x i64> %bc to <8 x i32>
  store <8 x i32> %strided.vec, ptr %S
  ret void
}

define void @shuffle_v64i8_to_v16i8(ptr %L, ptr %S) nounwind {
; AVX512-LABEL: shuffle_v64i8_to_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT: vpmovdb %zmm0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %vec = load <64 x i8>, ptr %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
  store <16 x i8> %strided.vec, ptr %S
  ret void
}

define void @trunc_v16i32_to_v16i8(ptr %L, ptr %S) nounwind {
; AVX512-LABEL: trunc_v16i32_to_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT: vpmovdb %zmm0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %vec = load <64 x i8>, ptr %L
  %bc = bitcast <64 x i8> %vec to <16 x i32>
  %strided.vec = trunc <16 x i32> %bc to <16 x i8>
  store <16 x i8> %strided.vec, ptr %S
  ret void
}

define void @shuffle_v32i16_to_v8i16(ptr %L, ptr %S) nounwind {
; AVX512-LABEL: shuffle_v32i16_to_v8i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT: vpmovqw %zmm0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %vec = load <32 x i16>, ptr %L
  %strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
  store <8 x i16> %strided.vec, ptr %S
  ret void
}

define void @trunc_v8i64_to_v8i16(ptr %L, ptr %S) nounwind {
; AVX512-LABEL: trunc_v8i64_to_v8i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT: vpmovqw %zmm0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %vec = load <32 x i16>, ptr %L
  %bc = bitcast <32 x i16> %vec to <8 x i64>
  %strided.vec = trunc <8 x i64> %bc to <8 x i16>
  store <8 x i16> %strided.vec, ptr %S
  ret void
}

define void @shuffle_v64i8_to_v8i8(ptr %L, ptr %S) nounwind {
; AVX512-LABEL: shuffle_v64i8_to_v8i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT: vpmovqb %zmm0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %vec = load <64 x i8>, ptr %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56>
  store <8 x i8> %strided.vec, ptr %S
  ret void
}

define void @trunc_v8i64_to_v8i8(ptr %L, ptr %S) nounwind {
; AVX512-LABEL: trunc_v8i64_to_v8i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT: vpmovqb %zmm0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %vec = load <64 x i8>, ptr %L
  %bc = bitcast <64 x i8> %vec to <8 x i64>
  %strided.vec = trunc <8 x i64> %bc to <8 x i8>
  store <8 x i8> %strided.vec, ptr %S
  ret void
}

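; Taking every 4th byte starting at index 1 is the same as shifting each i32
; element right by 8 and truncating it, so this lowers to vpsrld+vpmovdb on
; all subtargets.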
define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61(<64 x i8> %x) {
; AVX512-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsrld $8, %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %res = shufflevector <64 x i8> %x, <64 x i8> %x, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
  ret <16 x i8> %res
}

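; Same as above except the last index is 62 rather than 61, so the result is
; no longer a pure per-element truncation; only the VBMI targets handle it
; with a single byte permute.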
define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62(<64 x i8> %x) {
; AVX512F-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512F-NEXT: vpsrld $8, %ymm0, %ymm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512VL-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512VL-NEXT: vpsrld $8, %ymm0, %ymm0
; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0
; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512BW-NEXT: vpsrld $8, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BWVL-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512BWVL-NEXT: vpsrld $8, %ymm0, %ymm0
; AVX512BWVL-NEXT: vpmovdb %ymm0, %xmm0
; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMI-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,62]
; AVX512VBMI-NEXT: vpermb %zmm0, %zmm1, %zmm0
; AVX512VBMI-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
;
; AVX512VBMIVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm1 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,62]
; AVX512VBMIVL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512VBMIVL-NEXT: vpermt2b %ymm2, %ymm1, %ymm0
; AVX512VBMIVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %res = shufflevector <64 x i8> %x, <64 x i8> %x, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 62>
  ret <16 x i8> %res
}

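; PR34175: a <4 x i16> strided extract (elements 0, 8, 16, 24) of an unaligned
; <32 x i16> load, zero-extended and converted to double.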
define <4 x double> @PR34175(ptr %p) {
; AVX512F-LABEL: PR34175:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqu (%rdi), %xmm0
; AVX512F-NEXT: vmovdqu 32(%rdi), %xmm1
; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512F-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX512F-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: PR34175:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqu (%rdi), %xmm0
; AVX512VL-NEXT: vmovdqu 32(%rdi), %xmm1
; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX512VL-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: PR34175:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm0 = [0,8,32,40,0,8,32,40,0,8,32,40,0,8,32,40]
; AVX512BW-NEXT: vmovdqu (%rdi), %ymm1
; AVX512BW-NEXT: vmovdqu 32(%rdi), %ymm2
; AVX512BW-NEXT: vpermt2w %zmm2, %zmm0, %zmm1
; AVX512BW-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX512BW-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: PR34175:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,8,16,24,0,8,16,24]
; AVX512BWVL-NEXT: vmovdqu (%rdi), %ymm1
; AVX512BWVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1
; AVX512BWVL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX512BWVL-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX512BWVL-NEXT: retq
;
; AVX512VBMI-LABEL: PR34175:
; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: vpbroadcastq {{.*#+}} ymm0 = [0,8,32,40,0,8,32,40,0,8,32,40,0,8,32,40]
; AVX512VBMI-NEXT: vmovdqu (%rdi), %ymm1
; AVX512VBMI-NEXT: vmovdqu 32(%rdi), %ymm2
; AVX512VBMI-NEXT: vpermt2w %zmm2, %zmm0, %zmm1
; AVX512VBMI-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX512VBMI-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX512VBMI-NEXT: retq
;
; AVX512VBMIVL-LABEL: PR34175:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,8,16,24,0,8,16,24]
; AVX512VBMIVL-NEXT: vmovdqu (%rdi), %ymm1
; AVX512VBMIVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1
; AVX512VBMIVL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX512VBMIVL-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX512VBMIVL-NEXT: retq
  %v = load <32 x i16>, ptr %p, align 2
  %shuf = shufflevector <32 x i16> %v, <32 x i16> undef, <4 x i32> <i32 0, i32 8, i32 16, i32 24>
  %tofp = uitofp <4 x i16> %shuf to <4 x double>
  ret <4 x double> %tofp
}

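; Truncating to <8 x i8> and widening with zeroes to <16 x i8> should lower to
; a single vpmovqb.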
define <16 x i8> @trunc_v8i64_to_v8i8_return_v16i8(<8 x i64> %vec) nounwind {
; AVX512-LABEL: trunc_v8i64_to_v8i8_return_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovqb %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %truncated = trunc <8 x i64> %vec to <8 x i8>
  %result = shufflevector <8 x i8> %truncated, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i8> %result
}