; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL,AVX512VL-FAST-ALL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL,AVX512VL-FAST-PERLANE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW,AVX512BW-FAST-ALL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW,AVX512BW-FAST-PERLANE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL,AVX512BWVL-FAST-ALL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL,AVX512BWVL-FAST-PERLANE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VBMI
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VBMI
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VBMIVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VBMIVL

; Pairs of shufflevector:trunc functions with functional equivalence.
; Ideally, the shuffles should be lowered to code with the same quality as the truncates.
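;
; For example, taking the even-index bytes of a <64 x i8> value is bitwise
; equivalent to truncating the same data reinterpreted as <32 x i16>. An
; illustrative IR sketch (mirroring the first pair of tests below):
;   %even = shufflevector <64 x i8> %v, <64 x i8> undef, <32 x i32> <i32 0, i32 2, ...>
;   %bc   = bitcast <64 x i8> %v to <32 x i16>
;   %low  = trunc <32 x i16> %bc to <32 x i8>   ; same bytes as %even on little-endian x86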

define void @shuffle_v64i8_to_v32i8(ptr %L, ptr %S) nounwind {
; AVX512F-LABEL: shuffle_v64i8_to_v32i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14]
; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512F-NEXT: vmovdqa %ymm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-FAST-ALL-LABEL: shuffle_v64i8_to_v32i8:
; AVX512VL-FAST-ALL: # %bb.0:
; AVX512VL-FAST-ALL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-FAST-ALL-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512VL-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
; AVX512VL-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
; AVX512VL-FAST-ALL-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,2,5,7]
; AVX512VL-FAST-ALL-NEXT: vpermi2q %ymm1, %ymm0, %ymm2
; AVX512VL-FAST-ALL-NEXT: vmovdqa %ymm2, (%rsi)
; AVX512VL-FAST-ALL-NEXT: vzeroupper
; AVX512VL-FAST-ALL-NEXT: retq
;
; AVX512VL-FAST-PERLANE-LABEL: shuffle_v64i8_to_v32i8:
; AVX512VL-FAST-PERLANE: # %bb.0:
; AVX512VL-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512VL-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14]
; AVX512VL-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX512VL-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX512VL-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX512VL-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512VL-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rsi)
; AVX512VL-FAST-PERLANE-NEXT: vzeroupper
; AVX512VL-FAST-PERLANE-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v64i8_to_v32i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v64i8_to_v32i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BWVL-NEXT: vpmovwb %zmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMI-LABEL: shuffle_v64i8_to_v32i8:
; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512VBMI-NEXT: vpmovwb %zmm0, (%rsi)
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
;
; AVX512VBMIVL-LABEL: shuffle_v64i8_to_v32i8:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512VBMIVL-NEXT: vpmovwb %zmm0, (%rsi)
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %vec = load <64 x i8>, ptr %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62>
  store <32 x i8> %strided.vec, ptr %S
  ret void
}

define void @trunc_v32i16_to_v32i8(ptr %L, ptr %S) nounwind {
; AVX512F-LABEL: trunc_v32i16_to_v32i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512F-NEXT: vpmovdb %zmm1, 16(%rsi)
; AVX512F-NEXT: vpmovdb %zmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_v32i16_to_v32i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512VL-NEXT: vpmovdb %zmm1, 16(%rsi)
; AVX512VL-NEXT: vpmovdb %zmm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_v32i16_to_v32i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_v32i16_to_v32i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BWVL-NEXT: vpmovwb %zmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMI-LABEL: trunc_v32i16_to_v32i8:
; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512VBMI-NEXT: vpmovwb %zmm0, (%rsi)
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
;
; AVX512VBMIVL-LABEL: trunc_v32i16_to_v32i8:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512VBMIVL-NEXT: vpmovwb %zmm0, (%rsi)
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %vec = load <64 x i8>, ptr %L
  %bc = bitcast <64 x i8> %vec to <32 x i16>
  %strided.vec = trunc <32 x i16> %bc to <32 x i8>
  store <32 x i8> %strided.vec, ptr %S
  ret void
}

define void @shuffle_v32i16_to_v16i16(ptr %L, ptr %S) nounwind {
; AVX512-LABEL: shuffle_v32i16_to_v16i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT: vpmovdw %zmm0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %vec = load <32 x i16>, ptr %L
  %strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
  store <16 x i16> %strided.vec, ptr %S
  ret void
}

define void @trunc_v16i32_to_v16i16(ptr %L, ptr %S) nounwind {
; AVX512-LABEL: trunc_v16i32_to_v16i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT: vpmovdw %zmm0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %vec = load <32 x i16>, ptr %L
  %bc = bitcast <32 x i16> %vec to <16 x i32>
  %strided.vec = trunc <16 x i32> %bc to <16 x i16>
  store <16 x i16> %strided.vec, ptr %S
  ret void
}

define void @shuffle_v16i32_to_v8i32(ptr %L, ptr %S) nounwind {
; AVX512-LABEL: shuffle_v16i32_to_v8i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT: vpmovqd %zmm0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %vec = load <16 x i32>, ptr %L
  %strided.vec = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  store <8 x i32> %strided.vec, ptr %S
  ret void
}

define void @trunc_v8i64_to_v8i32(ptr %L, ptr %S) nounwind {
; AVX512-LABEL: trunc_v8i64_to_v8i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT: vpmovqd %zmm0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %vec = load <16 x i32>, ptr %L
  %bc = bitcast <16 x i32> %vec to <8 x i64>
  %strided.vec = trunc <8 x i64> %bc to <8 x i32>
  store <8 x i32> %strided.vec, ptr %S
  ret void
}

define void @shuffle_v64i8_to_v16i8(ptr %L, ptr %S) nounwind {
; AVX512-LABEL: shuffle_v64i8_to_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT: vpmovdb %zmm0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %vec = load <64 x i8>, ptr %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
  store <16 x i8> %strided.vec, ptr %S
  ret void
}

define void @trunc_v16i32_to_v16i8(ptr %L, ptr %S) nounwind {
; AVX512-LABEL: trunc_v16i32_to_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT: vpmovdb %zmm0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %vec = load <64 x i8>, ptr %L
  %bc = bitcast <64 x i8> %vec to <16 x i32>
  %strided.vec = trunc <16 x i32> %bc to <16 x i8>
  store <16 x i8> %strided.vec, ptr %S
  ret void
}

define void @shuffle_v32i16_to_v8i16(ptr %L, ptr %S) nounwind {
; AVX512-LABEL: shuffle_v32i16_to_v8i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT: vpmovqw %zmm0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %vec = load <32 x i16>, ptr %L
  %strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
  store <8 x i16> %strided.vec, ptr %S
  ret void
}

define void @trunc_v8i64_to_v8i16(ptr %L, ptr %S) nounwind {
; AVX512-LABEL: trunc_v8i64_to_v8i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT: vpmovqw %zmm0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %vec = load <32 x i16>, ptr %L
  %bc = bitcast <32 x i16> %vec to <8 x i64>
  %strided.vec = trunc <8 x i64> %bc to <8 x i16>
  store <8 x i16> %strided.vec, ptr %S
  ret void
}

define void @shuffle_v64i8_to_v8i8(ptr %L, ptr %S) nounwind {
; AVX512-LABEL: shuffle_v64i8_to_v8i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT: vpmovqb %zmm0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %vec = load <64 x i8>, ptr %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56>
  store <8 x i8> %strided.vec, ptr %S
  ret void
}

define void @trunc_v8i64_to_v8i8(ptr %L, ptr %S) nounwind {
; AVX512-LABEL: trunc_v8i64_to_v8i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT: vpmovqb %zmm0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %vec = load <64 x i8>, ptr %L
  %bc = bitcast <64 x i8> %vec to <8 x i64>
  %strided.vec = trunc <8 x i64> %bc to <8 x i8>
  store <8 x i8> %strided.vec, ptr %S
  ret void
}

define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61(<64 x i8> %x) {
; AVX512-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsrld $8, %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %res = shufflevector <64 x i8> %x, <64 x i8> %x, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
  ret <16 x i8> %res
}

define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62(<64 x i8> %x) {
; AVX512F-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512F-NEXT: vpsrld $8, %ymm0, %ymm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512VL-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512VL-NEXT: vpsrld $8, %ymm0, %ymm0
; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0
; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512BW-NEXT: vpsrld $8, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BWVL-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512BWVL-NEXT: vpsrld $8, %ymm0, %ymm0
; AVX512BWVL-NEXT: vpmovdb %ymm0, %xmm0
; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMI-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,62]
; AVX512VBMI-NEXT: vpermb %zmm0, %zmm1, %zmm0
; AVX512VBMI-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
;
; AVX512VBMIVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm1 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,62]
; AVX512VBMIVL-NEXT: vpermb %zmm0, %zmm1, %zmm0
; AVX512VBMIVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %res = shufflevector <64 x i8> %x, <64 x i8> %x, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 62>
  ret <16 x i8> %res
}

define <32 x i8> @trunc_shuffle_v32i16_v32i8_ofs1(<32 x i16> %a0) {
; AVX512F-LABEL: trunc_shuffle_v32i16_v32i8_ofs1:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512F-NEXT: retq
;
; AVX512VL-FAST-ALL-LABEL: trunc_shuffle_v32i16_v32i8_ofs1:
; AVX512VL-FAST-ALL: # %bb.0:
; AVX512VL-FAST-ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512VL-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31]
; AVX512VL-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
; AVX512VL-FAST-ALL-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,2,5,7]
; AVX512VL-FAST-ALL-NEXT: vpermi2q %ymm1, %ymm2, %ymm0
; AVX512VL-FAST-ALL-NEXT: retq
;
; AVX512VL-FAST-PERLANE-LABEL: trunc_shuffle_v32i16_v32i8_ofs1:
; AVX512VL-FAST-PERLANE: # %bb.0:
; AVX512VL-FAST-PERLANE-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512VL-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
; AVX512VL-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX512VL-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX512VL-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX512VL-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512VL-FAST-PERLANE-NEXT: retq
;
; AVX512BW-LABEL: trunc_shuffle_v32i16_v32i8_ofs1:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_shuffle_v32i16_v32i8_ofs1:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpsrlw $8, %zmm0, %zmm0
; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BWVL-NEXT: retq
;
; AVX512VBMI-LABEL: trunc_shuffle_v32i16_v32i8_ofs1:
; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: vpsrlw $8, %zmm0, %zmm0
; AVX512VBMI-NEXT: vpmovwb %zmm0, %ymm0
; AVX512VBMI-NEXT: retq
;
; AVX512VBMIVL-LABEL: trunc_shuffle_v32i16_v32i8_ofs1:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vpsrlw $8, %zmm0, %zmm0
; AVX512VBMIVL-NEXT: vpmovwb %zmm0, %ymm0
; AVX512VBMIVL-NEXT: retq
  %bc = bitcast <32 x i16> %a0 to <64 x i8>
  %res = shufflevector <64 x i8> %bc, <64 x i8> poison, <32 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 33, i32 35, i32 37, i32 39, i32 41, i32 43, i32 45, i32 47, i32 49, i32 51, i32 53, i32 55, i32 57, i32 59, i32 61, i32 63>
  ret <32 x i8> %res
}

define <4 x double> @PR34175(ptr %p) {
; AVX512F-LABEL: PR34175:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqu (%rdi), %xmm0
; AVX512F-NEXT: vmovdqu 32(%rdi), %xmm1
; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512F-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX512F-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: PR34175:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqu (%rdi), %xmm0
; AVX512VL-NEXT: vmovdqu 32(%rdi), %xmm1
; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX512VL-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: PR34175:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm0 = [0,8,32,40,0,8,32,40,0,8,32,40,0,8,32,40]
; AVX512BW-NEXT: vmovdqu (%rdi), %ymm1
; AVX512BW-NEXT: vmovdqu 32(%rdi), %ymm2
; AVX512BW-NEXT: vpermt2w %zmm2, %zmm0, %zmm1
; AVX512BW-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX512BW-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: PR34175:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0]
; AVX512BWVL-NEXT: vmovdqu (%rdi), %ymm1
; AVX512BWVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1
; AVX512BWVL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX512BWVL-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX512BWVL-NEXT: retq
;
; AVX512VBMI-LABEL: PR34175:
; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: vpbroadcastq {{.*#+}} ymm0 = [0,8,32,40,0,8,32,40,0,8,32,40,0,8,32,40]
; AVX512VBMI-NEXT: vmovdqu (%rdi), %ymm1
; AVX512VBMI-NEXT: vmovdqu 32(%rdi), %ymm2
; AVX512VBMI-NEXT: vpermt2w %zmm2, %zmm0, %zmm1
; AVX512VBMI-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX512VBMI-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX512VBMI-NEXT: retq
;
; AVX512VBMIVL-LABEL: PR34175:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0]
; AVX512VBMIVL-NEXT: vmovdqu (%rdi), %ymm1
; AVX512VBMIVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1
; AVX512VBMIVL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX512VBMIVL-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX512VBMIVL-NEXT: retq
  %v = load <32 x i16>, ptr %p, align 2
  %shuf = shufflevector <32 x i16> %v, <32 x i16> undef, <4 x i32> <i32 0, i32 8, i32 16, i32 24>
  %tofp = uitofp <4 x i16> %shuf to <4 x double>
  ret <4 x double> %tofp
}

define <16 x i8> @trunc_v8i64_to_v8i8_return_v16i8(<8 x i64> %vec) nounwind {
; AVX512-LABEL: trunc_v8i64_to_v8i8_return_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovqb %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %truncated = trunc <8 x i64> %vec to <8 x i8>
  %result = shufflevector <8 x i8> %truncated, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i8> %result
}

;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; AVX512BW-FAST-ALL: {{.*}}
; AVX512BW-FAST-PERLANE: {{.*}}
; AVX512BWVL-FAST-ALL: {{.*}}
; AVX512BWVL-FAST-PERLANE: {{.*}}