; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL,AVX512VL-FAST-ALL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL,AVX512VL-FAST-PERLANE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VBMI
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VBMI
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VBMIVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VBMIVL

; Pairs of shufflevector:trunc functions with functional equivalence.
; Ideally, the shuffles should be lowered to code with the same quality as the truncates.
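;
; For example (an illustrative sketch only, not an additional test case):
; taking the even-indexed bytes of a <64 x i8> value on this little-endian
; target is equivalent to a bitcast plus truncate,
;   %w = bitcast <64 x i8> %v to <32 x i16>
;   %t = trunc <32 x i16> %w to <32 x i8>   ; same value as the <0,2,4,...,62> shuffle of %v
; so with AVX512BW both forms can be lowered to a single vpmovwb.
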
define void @shuffle_v64i8_to_v32i8(<64 x i8>* %L, <32 x i8>* %S) nounwind {
; AVX512F-LABEL: shuffle_v64i8_to_v32i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512F-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
; AVX512F-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX512F-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512F-NEXT:    vmovdqa %ymm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-FAST-ALL-LABEL: shuffle_v64i8_to_v32i8:
; AVX512VL-FAST-ALL:       # %bb.0:
; AVX512VL-FAST-ALL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VL-FAST-ALL-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512VL-FAST-ALL-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
; AVX512VL-FAST-ALL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
; AVX512VL-FAST-ALL-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,5,7]
; AVX512VL-FAST-ALL-NEXT:    vpermi2q %ymm1, %ymm0, %ymm2
; AVX512VL-FAST-ALL-NEXT:    vmovdqa %ymm2, (%rsi)
; AVX512VL-FAST-ALL-NEXT:    vzeroupper
; AVX512VL-FAST-ALL-NEXT:    retq
;
; AVX512VL-FAST-PERLANE-LABEL: shuffle_v64i8_to_v32i8:
; AVX512VL-FAST-PERLANE:       # %bb.0:
; AVX512VL-FAST-PERLANE-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VL-FAST-PERLANE-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512VL-FAST-PERLANE-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
; AVX512VL-FAST-PERLANE-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
; AVX512VL-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX512VL-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512VL-FAST-PERLANE-NEXT:    vmovdqa %ymm0, (%rsi)
; AVX512VL-FAST-PERLANE-NEXT:    vzeroupper
; AVX512VL-FAST-PERLANE-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v64i8_to_v32i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT:    vpmovwb %zmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v64i8_to_v32i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BWVL-NEXT:    vpmovwb %zmm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; AVX512VBMI-LABEL: shuffle_v64i8_to_v32i8:
; AVX512VBMI:       # %bb.0:
; AVX512VBMI-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512VBMI-NEXT:    vpmovwb %zmm0, (%rsi)
; AVX512VBMI-NEXT:    vzeroupper
; AVX512VBMI-NEXT:    retq
;
; AVX512VBMIVL-LABEL: shuffle_v64i8_to_v32i8:
; AVX512VBMIVL:       # %bb.0:
; AVX512VBMIVL-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512VBMIVL-NEXT:    vpmovwb %zmm0, (%rsi)
; AVX512VBMIVL-NEXT:    vzeroupper
; AVX512VBMIVL-NEXT:    retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62>
  store <32 x i8> %strided.vec, <32 x i8>* %S
  ret void
}

define void @trunc_v32i16_to_v32i8(<64 x i8>* %L, <32 x i8>* %S) nounwind {
; AVX512F-LABEL: trunc_v32i16_to_v32i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512F-NEXT:    vpmovdb %zmm1, 16(%rsi)
; AVX512F-NEXT:    vpmovdb %zmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc_v32i16_to_v32i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512VL-NEXT:    vpmovdb %zmm1, 16(%rsi)
; AVX512VL-NEXT:    vpmovdb %zmm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc_v32i16_to_v32i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT:    vpmovwb %zmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc_v32i16_to_v32i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BWVL-NEXT:    vpmovwb %zmm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; AVX512VBMI-LABEL: trunc_v32i16_to_v32i8:
; AVX512VBMI:       # %bb.0:
; AVX512VBMI-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512VBMI-NEXT:    vpmovwb %zmm0, (%rsi)
; AVX512VBMI-NEXT:    vzeroupper
; AVX512VBMI-NEXT:    retq
;
; AVX512VBMIVL-LABEL: trunc_v32i16_to_v32i8:
; AVX512VBMIVL:       # %bb.0:
; AVX512VBMIVL-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512VBMIVL-NEXT:    vpmovwb %zmm0, (%rsi)
; AVX512VBMIVL-NEXT:    vzeroupper
; AVX512VBMIVL-NEXT:    retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %bc = bitcast <64 x i8> %vec to <32 x i16>
  %strided.vec = trunc <32 x i16> %bc to <32 x i8>
  store <32 x i8> %strided.vec, <32 x i8>* %S
  ret void
}

define void @shuffle_v32i16_to_v16i16(<32 x i16>* %L, <16 x i16>* %S) nounwind {
; AVX512-LABEL: shuffle_v32i16_to_v16i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT:    vpmovdw %zmm0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %vec = load <32 x i16>, <32 x i16>* %L
  %strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
  store <16 x i16> %strided.vec, <16 x i16>* %S
  ret void
}

define void @trunc_v16i32_to_v16i16(<32 x i16>* %L, <16 x i16>* %S) nounwind {
; AVX512-LABEL: trunc_v16i32_to_v16i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT:    vpmovdw %zmm0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %vec = load <32 x i16>, <32 x i16>* %L
  %bc = bitcast <32 x i16> %vec to <16 x i32>
  %strided.vec = trunc <16 x i32> %bc to <16 x i16>
  store <16 x i16> %strided.vec, <16 x i16>* %S
  ret void
}

define void @shuffle_v16i32_to_v8i32(<16 x i32>* %L, <8 x i32>* %S) nounwind {
; AVX512-LABEL: shuffle_v16i32_to_v8i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT:    vpmovqd %zmm0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %vec = load <16 x i32>, <16 x i32>* %L
  %strided.vec = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  store <8 x i32> %strided.vec, <8 x i32>* %S
  ret void
}

define void @trunc_v8i64_to_v8i32(<16 x i32>* %L, <8 x i32>* %S) nounwind {
; AVX512-LABEL: trunc_v8i64_to_v8i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT:    vpmovqd %zmm0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %vec = load <16 x i32>, <16 x i32>* %L
  %bc = bitcast <16 x i32> %vec to <8 x i64>
  %strided.vec = trunc <8 x i64> %bc to <8 x i32>
  store <8 x i32> %strided.vec, <8 x i32>* %S
  ret void
}

define void @shuffle_v64i8_to_v16i8(<64 x i8>* %L, <16 x i8>* %S) nounwind {
; AVX512-LABEL: shuffle_v64i8_to_v16i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT:    vpmovdb %zmm0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
  store <16 x i8> %strided.vec, <16 x i8>* %S
  ret void
}

define void @trunc_v16i32_to_v16i8(<64 x i8>* %L, <16 x i8>* %S) nounwind {
; AVX512-LABEL: trunc_v16i32_to_v16i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT:    vpmovdb %zmm0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %bc = bitcast <64 x i8> %vec to <16 x i32>
  %strided.vec = trunc <16 x i32> %bc to <16 x i8>
  store <16 x i8> %strided.vec, <16 x i8>* %S
  ret void
}

define void @shuffle_v32i16_to_v8i16(<32 x i16>* %L, <8 x i16>* %S) nounwind {
; AVX512-LABEL: shuffle_v32i16_to_v8i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT:    vpmovqw %zmm0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %vec = load <32 x i16>, <32 x i16>* %L
  %strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
  store <8 x i16> %strided.vec, <8 x i16>* %S
  ret void
}

define void @trunc_v8i64_to_v8i16(<32 x i16>* %L, <8 x i16>* %S) nounwind {
; AVX512-LABEL: trunc_v8i64_to_v8i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT:    vpmovqw %zmm0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %vec = load <32 x i16>, <32 x i16>* %L
  %bc = bitcast <32 x i16> %vec to <8 x i64>
  %strided.vec = trunc <8 x i64> %bc to <8 x i16>
  store <8 x i16> %strided.vec, <8 x i16>* %S
  ret void
}

define void @shuffle_v64i8_to_v8i8(<64 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX512-LABEL: shuffle_v64i8_to_v8i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT:    vpmovqb %zmm0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}

define void @trunc_v8i64_to_v8i8(<64 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX512-LABEL: trunc_v8i64_to_v8i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT:    vpmovqb %zmm0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %bc = bitcast <64 x i8> %vec to <8 x i64>
  %strided.vec = trunc <8 x i64> %bc to <8 x i8>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}

define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61(<64 x i8> %x) {
; AVX512F-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512F-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512F-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512VL-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BWVL-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512BWVL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; AVX512VBMI-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
; AVX512VBMI:       # %bb.0:
; AVX512VBMI-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61]
; AVX512VBMI-NEXT:    vpermb %zmm0, %zmm1, %zmm0
; AVX512VBMI-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI-NEXT:    vzeroupper
; AVX512VBMI-NEXT:    retq
;
; AVX512VBMIVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
; AVX512VBMIVL:       # %bb.0:
; AVX512VBMIVL-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61]
; AVX512VBMIVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
; AVX512VBMIVL-NEXT:    vpermt2b %ymm2, %ymm1, %ymm0
; AVX512VBMIVL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMIVL-NEXT:    vzeroupper
; AVX512VBMIVL-NEXT:    retq
  %res = shufflevector <64 x i8> %x, <64 x i8> %x, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
  ret <16 x i8> %res
}

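; As above, but the final index is 62 rather than 61, so the mask is not a pure
; stride-4 (truncating) pattern; AVX512VBMI can still select it with a single vpermb.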
define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62(<64 x i8> %x) {
; AVX512F-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT:    vpshufb %xmm2, %xmm0, %xmm2
; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm2
; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm2
; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm0, %xmm2
; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512BWVL-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
; AVX512BWVL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512BWVL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; AVX512VBMI-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512VBMI:       # %bb.0:
; AVX512VBMI-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,62]
; AVX512VBMI-NEXT:    vpermb %zmm0, %zmm1, %zmm0
; AVX512VBMI-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI-NEXT:    vzeroupper
; AVX512VBMI-NEXT:    retq
;
; AVX512VBMIVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512VBMIVL:       # %bb.0:
; AVX512VBMIVL-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,62]
; AVX512VBMIVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
; AVX512VBMIVL-NEXT:    vpermt2b %ymm2, %ymm1, %ymm0
; AVX512VBMIVL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMIVL-NEXT:    vzeroupper
; AVX512VBMIVL-NEXT:    retq
  %res = shufflevector <64 x i8> %x, <64 x i8> %x, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 62>
  ret <16 x i8> %res
}

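; PR34175: strided extraction of elements 0, 8, 16 and 24 from a <32 x i16>
; load, converted (uitofp) to <4 x double>.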
define <4 x double> @PR34175(<32 x i16>* %p) {
; AVX512F-LABEL: PR34175:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqu (%rdi), %xmm0
; AVX512F-NEXT:    vmovdqu 32(%rdi), %xmm1
; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX512F-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: PR34175:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqu (%rdi), %xmm0
; AVX512VL-NEXT:    vmovdqu 32(%rdi), %xmm1
; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX512VL-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: PR34175:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm0 = <0,8,32,40,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vmovdqu (%rdi), %ymm1
; AVX512BW-NEXT:    vmovdqu 32(%rdi), %ymm2
; AVX512BW-NEXT:    vpermt2w %zmm2, %zmm0, %zmm1
; AVX512BW-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX512BW-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: PR34175:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm0 = <0,8,16,24,u,u,u,u>
; AVX512BWVL-NEXT:    vmovdqu (%rdi), %ymm1
; AVX512BWVL-NEXT:    vpermt2w 32(%rdi), %ymm0, %ymm1
; AVX512BWVL-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX512BWVL-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX512BWVL-NEXT:    retq
;
; AVX512VBMI-LABEL: PR34175:
; AVX512VBMI:       # %bb.0:
; AVX512VBMI-NEXT:    vmovdqa {{.*#+}} ymm0 = <0,8,32,40,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VBMI-NEXT:    vmovdqu (%rdi), %ymm1
; AVX512VBMI-NEXT:    vmovdqu 32(%rdi), %ymm2
; AVX512VBMI-NEXT:    vpermt2w %zmm2, %zmm0, %zmm1
; AVX512VBMI-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX512VBMI-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX512VBMI-NEXT:    retq
;
; AVX512VBMIVL-LABEL: PR34175:
; AVX512VBMIVL:       # %bb.0:
; AVX512VBMIVL-NEXT:    vmovdqa {{.*#+}} xmm0 = <0,8,16,24,u,u,u,u>
; AVX512VBMIVL-NEXT:    vmovdqu (%rdi), %ymm1
; AVX512VBMIVL-NEXT:    vpermt2w 32(%rdi), %ymm0, %ymm1
; AVX512VBMIVL-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX512VBMIVL-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX512VBMIVL-NEXT:    retq
  %v = load <32 x i16>, <32 x i16>* %p, align 2
  %shuf = shufflevector <32 x i16> %v, <32 x i16> undef, <4 x i32> <i32 0, i32 8, i32 16, i32 24>
  %tofp = uitofp <4 x i16> %shuf to <4 x double>
  ret <4 x double> %tofp
}

define <16 x i8> @trunc_v8i64_to_v8i8_return_v16i8(<8 x i64> %vec) nounwind {
; AVX512-LABEL: trunc_v8i64_to_v8i8_return_v16i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmovqb %zmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %truncated = trunc <8 x i64> %vec to <8 x i8>
  %result = shufflevector <8 x i8> %truncated, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i8> %result
}