; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL,AVX512VL-FAST-ALL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL,AVX512VL-FAST-PERLANE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VBMI
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VBMI
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VBMIVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VBMIVL

; Pairs of shufflevector:trunc functions with functional equivalence.
; Ideally, the shuffles should be lowered to code with the same quality as the truncates.
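;
; As an illustrative sketch (assuming little-endian element order), taking the
; even-indexed bytes of a <64 x i8> selects the low byte of every i16 in the
; same bits, so the first pair of tests below should ideally both lower to a
; single vpmovwb where that instruction is available:
;   %bc = bitcast <64 x i8> %vec to <32 x i16>
;   %lo = trunc <32 x i16> %bc to <32 x i8>  ; same result as the even-byte shufflevector of %vec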

define void @shuffle_v64i8_to_v32i8(ptr %L, ptr %S) nounwind {
; AVX512F-LABEL: shuffle_v64i8_to_v32i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14]
; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512F-NEXT: vmovdqa %ymm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-FAST-ALL-LABEL: shuffle_v64i8_to_v32i8:
; AVX512VL-FAST-ALL: # %bb.0:
; AVX512VL-FAST-ALL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-FAST-ALL-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512VL-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
; AVX512VL-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
; AVX512VL-FAST-ALL-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,2,5,7]
; AVX512VL-FAST-ALL-NEXT: vpermi2q %ymm1, %ymm0, %ymm2
; AVX512VL-FAST-ALL-NEXT: vmovdqa %ymm2, (%rsi)
; AVX512VL-FAST-ALL-NEXT: vzeroupper
; AVX512VL-FAST-ALL-NEXT: retq
;
; AVX512VL-FAST-PERLANE-LABEL: shuffle_v64i8_to_v32i8:
; AVX512VL-FAST-PERLANE: # %bb.0:
; AVX512VL-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512VL-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14]
; AVX512VL-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX512VL-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX512VL-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX512VL-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512VL-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rsi)
; AVX512VL-FAST-PERLANE-NEXT: vzeroupper
; AVX512VL-FAST-PERLANE-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v64i8_to_v32i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v64i8_to_v32i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BWVL-NEXT: vpmovwb %zmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMI-LABEL: shuffle_v64i8_to_v32i8:
; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512VBMI-NEXT: vpmovwb %zmm0, (%rsi)
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
;
; AVX512VBMIVL-LABEL: shuffle_v64i8_to_v32i8:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512VBMIVL-NEXT: vpmovwb %zmm0, (%rsi)
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %vec = load <64 x i8>, ptr %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62>
  store <32 x i8> %strided.vec, ptr %S
  ret void
}

define void @trunc_v32i16_to_v32i8(ptr %L, ptr %S) nounwind {
; AVX512F-LABEL: trunc_v32i16_to_v32i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512F-NEXT: vpmovdb %zmm1, 16(%rsi)
; AVX512F-NEXT: vpmovdb %zmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_v32i16_to_v32i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512VL-NEXT: vpmovdb %zmm1, 16(%rsi)
; AVX512VL-NEXT: vpmovdb %zmm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_v32i16_to_v32i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_v32i16_to_v32i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BWVL-NEXT: vpmovwb %zmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMI-LABEL: trunc_v32i16_to_v32i8:
; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512VBMI-NEXT: vpmovwb %zmm0, (%rsi)
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
;
; AVX512VBMIVL-LABEL: trunc_v32i16_to_v32i8:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512VBMIVL-NEXT: vpmovwb %zmm0, (%rsi)
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %vec = load <64 x i8>, ptr %L
  %bc = bitcast <64 x i8> %vec to <32 x i16>
  %strided.vec = trunc <32 x i16> %bc to <32 x i8>
  store <32 x i8> %strided.vec, ptr %S
  ret void
}

define void @shuffle_v32i16_to_v16i16(ptr %L, ptr %S) nounwind {
; AVX512-LABEL: shuffle_v32i16_to_v16i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT: vpmovdw %zmm0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %vec = load <32 x i16>, ptr %L
  %strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
  store <16 x i16> %strided.vec, ptr %S
  ret void
}

define void @trunc_v16i32_to_v16i16(ptr %L, ptr %S) nounwind {
; AVX512-LABEL: trunc_v16i32_to_v16i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT: vpmovdw %zmm0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %vec = load <32 x i16>, ptr %L
  %bc = bitcast <32 x i16> %vec to <16 x i32>
  %strided.vec = trunc <16 x i32> %bc to <16 x i16>
  store <16 x i16> %strided.vec, ptr %S
  ret void
}

define void @shuffle_v16i32_to_v8i32(ptr %L, ptr %S) nounwind {
; AVX512-LABEL: shuffle_v16i32_to_v8i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT: vpmovqd %zmm0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %vec = load <16 x i32>, ptr %L
  %strided.vec = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  store <8 x i32> %strided.vec, ptr %S
  ret void
}

define void @trunc_v8i64_to_v8i32(ptr %L, ptr %S) nounwind {
; AVX512-LABEL: trunc_v8i64_to_v8i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT: vpmovqd %zmm0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %vec = load <16 x i32>, ptr %L
  %bc = bitcast <16 x i32> %vec to <8 x i64>
  %strided.vec = trunc <8 x i64> %bc to <8 x i32>
  store <8 x i32> %strided.vec, ptr %S
  ret void
}

define void @shuffle_v64i8_to_v16i8(ptr %L, ptr %S) nounwind {
; AVX512-LABEL: shuffle_v64i8_to_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT: vpmovdb %zmm0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %vec = load <64 x i8>, ptr %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
  store <16 x i8> %strided.vec, ptr %S
  ret void
}

define void @trunc_v16i32_to_v16i8(ptr %L, ptr %S) nounwind {
; AVX512-LABEL: trunc_v16i32_to_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT: vpmovdb %zmm0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %vec = load <64 x i8>, ptr %L
  %bc = bitcast <64 x i8> %vec to <16 x i32>
  %strided.vec = trunc <16 x i32> %bc to <16 x i8>
  store <16 x i8> %strided.vec, ptr %S
  ret void
}

define void @shuffle_v32i16_to_v8i16(ptr %L, ptr %S) nounwind {
; AVX512-LABEL: shuffle_v32i16_to_v8i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT: vpmovqw %zmm0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %vec = load <32 x i16>, ptr %L
  %strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
  store <8 x i16> %strided.vec, ptr %S
  ret void
}

define void @trunc_v8i64_to_v8i16(ptr %L, ptr %S) nounwind {
; AVX512-LABEL: trunc_v8i64_to_v8i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT: vpmovqw %zmm0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %vec = load <32 x i16>, ptr %L
  %bc = bitcast <32 x i16> %vec to <8 x i64>
  %strided.vec = trunc <8 x i64> %bc to <8 x i16>
  store <8 x i16> %strided.vec, ptr %S
  ret void
}

define void @shuffle_v64i8_to_v8i8(ptr %L, ptr %S) nounwind {
; AVX512-LABEL: shuffle_v64i8_to_v8i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT: vpmovqb %zmm0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %vec = load <64 x i8>, ptr %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56>
  store <8 x i8> %strided.vec, ptr %S
  ret void
}

define void @trunc_v8i64_to_v8i8(ptr %L, ptr %S) nounwind {
; AVX512-LABEL: trunc_v8i64_to_v8i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT: vpmovqb %zmm0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %vec = load <64 x i8>, ptr %L
  %bc = bitcast <64 x i8> %vec to <8 x i64>
  %strided.vec = trunc <8 x i64> %bc to <8 x i8>
  store <8 x i8> %strided.vec, ptr %S
  ret void
}
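
; Selecting byte 1 of every i32 lane matches a 32-bit logical right shift by 8
; followed by a dword-to-byte truncate, which is how all targets lower the
; shuffle below (vpsrld + vpmovdb).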
define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61(<64 x i8> %x) {
; AVX512-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsrld $8, %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %res = shufflevector <64 x i8> %x, <64 x i8> %x, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
  ret <16 x i8> %res
}
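
; Same selection as above except that the final index is 62 rather than 61, so
; the result is no longer a plain shift+truncate; only the VBMI targets can
; still lower it as a single cross-lane byte permute (vpermb).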
define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62(<64 x i8> %x) {
; AVX512F-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512F-NEXT: vpsrld $8, %ymm0, %ymm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512VL-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512VL-NEXT: vpsrld $8, %ymm0, %ymm0
; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0
; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512BW-NEXT: vpsrld $8, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BWVL-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512BWVL-NEXT: vpsrld $8, %ymm0, %ymm0
; AVX512BWVL-NEXT: vpmovdb %ymm0, %xmm0
; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMI-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,62]
; AVX512VBMI-NEXT: vpermb %zmm0, %zmm1, %zmm0
; AVX512VBMI-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
;
; AVX512VBMIVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm1 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,62]
; AVX512VBMIVL-NEXT: vpermb %zmm0, %zmm1, %zmm0
; AVX512VBMIVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %res = shufflevector <64 x i8> %x, <64 x i8> %x, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 62>
  ret <16 x i8> %res
}
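
; Regression test for PR34175: a strided extract of four i16 elements from an
; under-aligned load, zero-extended and converted to double.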
define <4 x double> @PR34175(ptr %p) {
; AVX512F-LABEL: PR34175:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqu (%rdi), %xmm0
; AVX512F-NEXT: vmovdqu 32(%rdi), %xmm1
; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512F-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX512F-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: PR34175:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqu (%rdi), %xmm0
; AVX512VL-NEXT: vmovdqu 32(%rdi), %xmm1
; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX512VL-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: PR34175:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm0 = [0,8,32,40,0,8,32,40,0,8,32,40,0,8,32,40]
; AVX512BW-NEXT: vmovdqu (%rdi), %ymm1
; AVX512BW-NEXT: vmovdqu 32(%rdi), %ymm2
; AVX512BW-NEXT: vpermt2w %zmm2, %zmm0, %zmm1
; AVX512BW-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX512BW-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: PR34175:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0]
; AVX512BWVL-NEXT: vmovdqu (%rdi), %ymm1
; AVX512BWVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1
; AVX512BWVL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX512BWVL-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX512BWVL-NEXT: retq
;
; AVX512VBMI-LABEL: PR34175:
; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: vpbroadcastq {{.*#+}} ymm0 = [0,8,32,40,0,8,32,40,0,8,32,40,0,8,32,40]
; AVX512VBMI-NEXT: vmovdqu (%rdi), %ymm1
; AVX512VBMI-NEXT: vmovdqu 32(%rdi), %ymm2
; AVX512VBMI-NEXT: vpermt2w %zmm2, %zmm0, %zmm1
; AVX512VBMI-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX512VBMI-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX512VBMI-NEXT: retq
;
; AVX512VBMIVL-LABEL: PR34175:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0]
; AVX512VBMIVL-NEXT: vmovdqu (%rdi), %ymm1
; AVX512VBMIVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1
; AVX512VBMIVL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX512VBMIVL-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX512VBMIVL-NEXT: retq
  %v = load <32 x i16>, ptr %p, align 2
  %shuf = shufflevector <32 x i16> %v, <32 x i16> undef, <4 x i32> <i32 0, i32 8, i32 16, i32 24>
  %tofp = uitofp <4 x i16> %shuf to <4 x double>
  ret <4 x double> %tofp
}
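
; vpmovqb with a register destination already zeroes the upper bytes of the
; result xmm, so the shufflevector that pads the truncated <8 x i8> with zeros
; should fold away into the single vpmovqb checked below.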
define <16 x i8> @trunc_v8i64_to_v8i8_return_v16i8(<8 x i64> %vec) nounwind {
; AVX512-LABEL: trunc_v8i64_to_v8i8_return_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovqb %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %truncated = trunc <8 x i64> %vec to <8 x i8>
  %result = shufflevector <8 x i8> %truncated, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i8> %result
}