; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL,AVX512BWVL-FAST-ALL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL,AVX512BWVL-FAST-PERLANE
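
; Stride-2 deinterleave of the odd-index bytes of a <64 x i8> load. With
; AVX512BW this folds into a 16-bit shift of the load plus a truncating
; vpmovwb store; plain AVX512F has to use per-lane vpshufb shuffles, a blend
; and a cross-lane vpermq instead.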
define void @shuffle_v64i8_to_v32i8_1(ptr %L, ptr %S) nounwind {
; AVX512F-LABEL: shuffle_v64i8_to_v32i8_1:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512F-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31]
; AVX512F-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX512F-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512F-NEXT:    vmovdqa %ymm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v64i8_to_v32i8_1:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsrlw $8, (%rdi), %zmm0
; AVX512BW-NEXT:    vpmovwb %zmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v64i8_to_v32i8_1:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrlw $8, (%rdi), %zmm0
; AVX512BWVL-NEXT:    vpmovwb %zmm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <64 x i8>, ptr %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <32 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 33, i32 35, i32 37, i32 39, i32 41, i32 43, i32 45, i32 47, i32 49, i32 51, i32 53, i32 55, i32 57, i32 59, i32 61, i32 63>
  store <32 x i8> %strided.vec, ptr %S
  ret void
}

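; Odd-index i16 elements of a <32 x i16> load: lowered as vpsrld $16 plus a
; truncating vpmovdw store on all AVX512 subtargets.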
define void @shuffle_v32i16_to_v16i16_1(ptr %L, ptr %S) nounwind {
; AVX512-LABEL: shuffle_v32i16_to_v16i16_1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrld $16, (%rdi), %zmm0
; AVX512-NEXT:    vpmovdw %zmm0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %vec = load <32 x i16>, ptr %L
  %strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
  store <16 x i16> %strided.vec, ptr %S
  ret void
}

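; Odd-index i32 elements of a <16 x i32> load. With fast variable cross-lane
; shuffles available, AVX512BW+VL uses a single vpermi2d; AVX512F and the
; fast-perlane-only configuration use vshufps plus vpermpd.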
define void @shuffle_v16i32_to_v8i32_1(ptr %L, ptr %S) nounwind {
; AVX512F-LABEL: shuffle_v16i32_to_v8i32_1:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovaps (%rdi), %ymm0
; AVX512F-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],mem[1,3],ymm0[5,7],mem[5,7]
; AVX512F-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512F-NEXT:    vmovaps %ymm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512BWVL-FAST-ALL-LABEL: shuffle_v16i32_to_v8i32_1:
; AVX512BWVL-FAST-ALL:       # %bb.0:
; AVX512BWVL-FAST-ALL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BWVL-FAST-ALL-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,3,5,7,9,11,13,15]
; AVX512BWVL-FAST-ALL-NEXT:    vpermi2d 32(%rdi), %ymm0, %ymm1
; AVX512BWVL-FAST-ALL-NEXT:    vmovdqa %ymm1, (%rsi)
; AVX512BWVL-FAST-ALL-NEXT:    vzeroupper
; AVX512BWVL-FAST-ALL-NEXT:    retq
;
; AVX512BWVL-FAST-PERLANE-LABEL: shuffle_v16i32_to_v8i32_1:
; AVX512BWVL-FAST-PERLANE:       # %bb.0:
; AVX512BWVL-FAST-PERLANE-NEXT:    vmovaps (%rdi), %ymm0
; AVX512BWVL-FAST-PERLANE-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],mem[1,3],ymm0[5,7],mem[5,7]
; AVX512BWVL-FAST-PERLANE-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512BWVL-FAST-PERLANE-NEXT:    vmovaps %ymm0, (%rsi)
; AVX512BWVL-FAST-PERLANE-NEXT:    vzeroupper
; AVX512BWVL-FAST-PERLANE-NEXT:    retq
  %vec = load <16 x i32>, ptr %L
  %strided.vec = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  store <8 x i32> %strided.vec, ptr %S
  ret void
}

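; Stride-4 extractions of bytes 1, 2 and 3 of every dword of a <64 x i8>
; load: each lowers to a dword shift (vpsrld) folded into the load, followed
; by a truncating vpmovdb store.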
define void @shuffle_v64i8_to_v16i8_1(ptr %L, ptr %S) nounwind {
; AVX512-LABEL: shuffle_v64i8_to_v16i8_1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrld $8, (%rdi), %zmm0
; AVX512-NEXT:    vpmovdb %zmm0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %vec = load <64 x i8>, ptr %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
  store <16 x i8> %strided.vec, ptr %S
  ret void
}

define void @shuffle_v64i8_to_v16i8_2(ptr %L, ptr %S) nounwind {
; AVX512-LABEL: shuffle_v64i8_to_v16i8_2:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrld $16, (%rdi), %zmm0
; AVX512-NEXT:    vpmovdb %zmm0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %vec = load <64 x i8>, ptr %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62>
  store <16 x i8> %strided.vec, ptr %S
  ret void
}

define void @shuffle_v64i8_to_v16i8_3(ptr %L, ptr %S) nounwind {
; AVX512-LABEL: shuffle_v64i8_to_v16i8_3:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrld $24, (%rdi), %zmm0
; AVX512-NEXT:    vpmovdb %zmm0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %vec = load <64 x i8>, ptr %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
  store <16 x i8> %strided.vec, ptr %S
  ret void
}

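; Stride-4 extractions of i16 elements 1, 2 and 3 of every qword of a
; <32 x i16> load: a qword shift (vpsrlq $16/$32/$48) plus a truncating
; vpmovqw store.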
define void @shuffle_v32i16_to_v8i16_1(ptr %L, ptr %S) nounwind {
; AVX512-LABEL: shuffle_v32i16_to_v8i16_1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrlq $16, (%rdi), %zmm0
; AVX512-NEXT:    vpmovqw %zmm0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %vec = load <32 x i16>, ptr %L
  %strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
  store <8 x i16> %strided.vec, ptr %S
  ret void
}

define void @shuffle_v32i16_to_v8i16_2(ptr %L, ptr %S) nounwind {
; AVX512-LABEL: shuffle_v32i16_to_v8i16_2:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrlq $32, (%rdi), %zmm0
; AVX512-NEXT:    vpmovqw %zmm0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %vec = load <32 x i16>, ptr %L
  %strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
  store <8 x i16> %strided.vec, ptr %S
  ret void
}

define void @shuffle_v32i16_to_v8i16_3(ptr %L, ptr %S) nounwind {
; AVX512-LABEL: shuffle_v32i16_to_v8i16_3:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrlq $48, (%rdi), %zmm0
; AVX512-NEXT:    vpmovqw %zmm0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %vec = load <32 x i16>, ptr %L
  %strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
  store <8 x i16> %strided.vec, ptr %S
  ret void
}

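; Stride-8 extractions of bytes 1 through 7 of every qword of a <64 x i8>
; load: each offset lowers to vpsrlq by 8*offset bits plus a truncating
; vpmovqb store.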
define void @shuffle_v64i8_to_v8i8_1(ptr %L, ptr %S) nounwind {
; AVX512-LABEL: shuffle_v64i8_to_v8i8_1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrlq $8, (%rdi), %zmm0
; AVX512-NEXT:    vpmovqb %zmm0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %vec = load <64 x i8>, ptr %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 1, i32 9, i32 17, i32 25, i32 33, i32 41, i32 49, i32 57>
  store <8 x i8> %strided.vec, ptr %S
  ret void
}

define void @shuffle_v64i8_to_v8i8_2(ptr %L, ptr %S) nounwind {
; AVX512-LABEL: shuffle_v64i8_to_v8i8_2:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrlq $16, (%rdi), %zmm0
; AVX512-NEXT:    vpmovqb %zmm0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %vec = load <64 x i8>, ptr %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 2, i32 10, i32 18, i32 26, i32 34, i32 42, i32 50, i32 58>
  store <8 x i8> %strided.vec, ptr %S
  ret void
}

define void @shuffle_v64i8_to_v8i8_3(ptr %L, ptr %S) nounwind {
; AVX512-LABEL: shuffle_v64i8_to_v8i8_3:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrlq $24, (%rdi), %zmm0
; AVX512-NEXT:    vpmovqb %zmm0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %vec = load <64 x i8>, ptr %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 3, i32 11, i32 19, i32 27, i32 35, i32 43, i32 51, i32 59>
  store <8 x i8> %strided.vec, ptr %S
  ret void
}

define void @shuffle_v64i8_to_v8i8_4(ptr %L, ptr %S) nounwind {
; AVX512-LABEL: shuffle_v64i8_to_v8i8_4:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrlq $32, (%rdi), %zmm0
; AVX512-NEXT:    vpmovqb %zmm0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %vec = load <64 x i8>, ptr %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 4, i32 12, i32 20, i32 28, i32 36, i32 44, i32 52, i32 60>
  store <8 x i8> %strided.vec, ptr %S
  ret void
}

define void @shuffle_v64i8_to_v8i8_5(ptr %L, ptr %S) nounwind {
; AVX512-LABEL: shuffle_v64i8_to_v8i8_5:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrlq $40, (%rdi), %zmm0
; AVX512-NEXT:    vpmovqb %zmm0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %vec = load <64 x i8>, ptr %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 5, i32 13, i32 21, i32 29, i32 37, i32 45, i32 53, i32 61>
  store <8 x i8> %strided.vec, ptr %S
  ret void
}

define void @shuffle_v64i8_to_v8i8_6(ptr %L, ptr %S) nounwind {
; AVX512-LABEL: shuffle_v64i8_to_v8i8_6:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrlq $48, (%rdi), %zmm0
; AVX512-NEXT:    vpmovqb %zmm0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %vec = load <64 x i8>, ptr %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 6, i32 14, i32 22, i32 30, i32 38, i32 46, i32 54, i32 62>
  store <8 x i8> %strided.vec, ptr %S
  ret void
}

define void @shuffle_v64i8_to_v8i8_7(ptr %L, ptr %S) nounwind {
; AVX512-LABEL: shuffle_v64i8_to_v8i8_7:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrlq $56, (%rdi), %zmm0
; AVX512-NEXT:    vpmovqb %zmm0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %vec = load <64 x i8>, ptr %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 7, i32 15, i32 23, i32 31, i32 39, i32 47, i32 55, i32 63>
  store <8 x i8> %strided.vec, ptr %S
  ret void
}

;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: