; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
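; Each test below loads a 512-bit vector from memory and extracts a strided
; subset of its lanes (every 2nd, 4th or 8th element, at various offsets),
; then stores the narrowed result. The RUN lines cover different AVX-512
; feature combinations, so the checks exercise both the vpshufb/vpblendd
; lowerings and the vpermi2*/vpmovwb lowerings.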

define void @shuffle_v64i8_to_v32i8_1(<64 x i8>* %L, <32 x i8>* %S) nounwind {
; AVX512F-LABEL: shuffle_v64i8_to_v32i8_1:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512F-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31]
; AVX512F-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX512F-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512F-NEXT:    vmovdqa %ymm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v64i8_to_v32i8_1:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512VL-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31]
; AVX512VL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,5,7]
; AVX512VL-NEXT:    vpermi2q %ymm1, %ymm0, %ymm2
; AVX512VL-NEXT:    vmovdqa %ymm2, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v64i8_to_v32i8_1:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31]
; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX512BW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512BW-NEXT:    vmovdqa %ymm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v64i8_to_v32i8_1:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512BWVL-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31]
; AVX512BWVL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,5,7]
; AVX512BWVL-NEXT:    vpermi2q %ymm1, %ymm0, %ymm2
; AVX512BWVL-NEXT:    vmovdqa %ymm2, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <32 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 33, i32 35, i32 37, i32 39, i32 41, i32 43, i32 45, i32 47, i32 49, i32 51, i32 53, i32 55, i32 57, i32 59, i32 61, i32 63>
  store <32 x i8> %strided.vec, <32 x i8>* %S
  ret void
}

define void @shuffle_v32i16_to_v16i16_1(<32 x i16>* %L, <16 x i16>* %S) nounwind {
; AVX512F-LABEL: shuffle_v32i16_to_v16i16_1:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512F-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[6,7,2,3,4,5,6,7,2,3,6,7,10,11,14,15,22,23,18,19,20,21,22,23,18,19,22,23,26,27,30,31]
; AVX512F-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,30,31,26,27,28,29,30,31]
; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX512F-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512F-NEXT:    vmovdqa %ymm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v32i16_to_v16i16_1:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512VL-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[6,7,2,3,4,5,6,7,2,3,6,7,10,11,14,15,22,23,18,19,20,21,22,23,18,19,22,23,26,27,30,31]
; AVX512VL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,30,31,26,27,28,29,30,31]
; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,5,7]
; AVX512VL-NEXT:    vpermi2q %ymm1, %ymm0, %ymm2
; AVX512VL-NEXT:    vmovdqa %ymm2, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v32i16_to_v16i16_1:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[6,7,2,3,4,5,6,7,2,3,6,7,10,11,14,15,22,23,18,19,20,21,22,23,18,19,22,23,26,27,30,31]
; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,30,31,26,27,28,29,30,31]
; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX512BW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512BW-NEXT:    vmovdqa %ymm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v32i16_to_v16i16_1:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
; AVX512BWVL-NEXT:    vpermi2w 32(%rdi), %ymm0, %ymm1
; AVX512BWVL-NEXT:    vmovdqa %ymm1, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <32 x i16>, <32 x i16>* %L
  %strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
  store <16 x i16> %strided.vec, <16 x i16>* %S
  ret void
}

define void @shuffle_v16i32_to_v8i32_1(<16 x i32>* %L, <8 x i32>* %S) nounwind {
; AVX512F-LABEL: shuffle_v16i32_to_v8i32_1:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovaps (%rdi), %ymm0
; AVX512F-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],mem[1,3],ymm0[5,7],mem[5,7]
; AVX512F-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512F-NEXT:    vmovaps %ymm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v16i32_to_v8i32_1:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,3,5,7,9,11,13,15]
; AVX512VL-NEXT:    vpermi2d 32(%rdi), %ymm0, %ymm1
; AVX512VL-NEXT:    vmovdqa %ymm1, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v16i32_to_v8i32_1:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovaps (%rdi), %ymm0
; AVX512BW-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],mem[1,3],ymm0[5,7],mem[5,7]
; AVX512BW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512BW-NEXT:    vmovaps %ymm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v16i32_to_v8i32_1:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,3,5,7,9,11,13,15]
; AVX512BWVL-NEXT:    vpermi2d 32(%rdi), %ymm0, %ymm1
; AVX512BWVL-NEXT:    vmovdqa %ymm1, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <16 x i32>, <16 x i32>* %L
  %strided.vec = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  store <8 x i32> %strided.vec, <8 x i32>* %S
  ret void
}

define void @shuffle_v64i8_to_v16i8_1(<64 x i8>* %L, <16 x i8>* %S) nounwind {
; AVX512-LABEL: shuffle_v64i8_to_v16i8_1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT:    vmovdqa 32(%rdi), %xmm2
; AVX512-NEXT:    vmovdqa 48(%rdi), %xmm3
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
; AVX512-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
; AVX512-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
; AVX512-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
  store <16 x i8> %strided.vec, <16 x i8>* %S
  ret void
}

define void @shuffle_v64i8_to_v16i8_2(<64 x i8>* %L, <16 x i8>* %S) nounwind {
; AVX512-LABEL: shuffle_v64i8_to_v16i8_2:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT:    vmovdqa 32(%rdi), %xmm2
; AVX512-NEXT:    vmovdqa 48(%rdi), %xmm3
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
; AVX512-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
; AVX512-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm3 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
; AVX512-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62>
  store <16 x i8> %strided.vec, <16 x i8>* %S
  ret void
}

define void @shuffle_v64i8_to_v16i8_3(<64 x i8>* %L, <16 x i8>* %S) nounwind {
; AVX512-LABEL: shuffle_v64i8_to_v16i8_3:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT:    vmovdqa 32(%rdi), %xmm2
; AVX512-NEXT:    vmovdqa 48(%rdi), %xmm3
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
; AVX512-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
; AVX512-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
; AVX512-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
  store <16 x i8> %strided.vec, <16 x i8>* %S
  ret void
}

define void @shuffle_v32i16_to_v8i16_1(<32 x i16>* %L, <8 x i16>* %S) nounwind {
; AVX512F-LABEL: shuffle_v32i16_to_v8i16_1:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7]
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = mem[0,2,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = mem[0,2,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm2 = mem[0,2,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512F-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v32i16_to_v8i16_1:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512VL-NEXT:    vmovdqa 32(%rdi), %xmm2
; AVX512VL-NEXT:    vmovdqa 48(%rdi), %xmm3
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15]
; AVX512VL-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
; AVX512VL-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
; AVX512VL-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v32i16_to_v8i16_1:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512BW-NEXT:    vmovdqa 32(%rdi), %xmm2
; AVX512BW-NEXT:    vmovdqa 48(%rdi), %xmm3
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
; AVX512BW-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
; AVX512BW-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v32i16_to_v8i16_1:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} ymm1 = <1,5,9,13,17,21,25,29,u,u,u,u,u,u,u,u>
; AVX512BWVL-NEXT:    vpermi2w 32(%rdi), %ymm0, %ymm1
; AVX512BWVL-NEXT:    vmovdqa %xmm1, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <32 x i16>, <32 x i16>* %L
  %strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
  store <8 x i16> %strided.vec, <8 x i16>* %S
  ret void
}

define void @shuffle_v32i16_to_v8i16_2(<32 x i16>* %L, <8 x i16>* %S) nounwind {
; AVX512F-LABEL: shuffle_v32i16_to_v8i16_2:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7]
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = mem[3,1,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7]
; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = mem[3,1,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm2 = mem[3,1,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7]
; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512F-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v32i16_to_v8i16_2:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512VL-NEXT:    vmovdqa 32(%rdi), %xmm2
; AVX512VL-NEXT:    vmovdqa 48(%rdi), %xmm3
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm4 = [12,13,14,15,4,5,12,13,8,9,10,11,12,13,14,15]
; AVX512VL-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
; AVX512VL-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,5,12,13,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
; AVX512VL-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v32i16_to_v8i16_2:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512BW-NEXT:    vmovdqa 32(%rdi), %xmm2
; AVX512BW-NEXT:    vmovdqa 48(%rdi), %xmm3
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm4 = [12,13,14,15,4,5,12,13,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
; AVX512BW-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,5,12,13,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
; AVX512BW-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v32i16_to_v8i16_2:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} ymm1 = <2,6,10,14,18,22,26,30,u,u,u,u,u,u,u,u>
; AVX512BWVL-NEXT:    vpermi2w 32(%rdi), %ymm0, %ymm1
; AVX512BWVL-NEXT:    vmovdqa %xmm1, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <32 x i16>, <32 x i16>* %L
  %strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
  store <8 x i16> %strided.vec, <8 x i16>* %S
  ret void
}

define void @shuffle_v32i16_to_v8i16_3(<32 x i16>* %L, <8 x i16>* %S) nounwind {
; AVX512F-LABEL: shuffle_v32i16_to_v8i16_3:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7]
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = mem[3,1,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7]
; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = mem[3,1,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm2 = mem[3,1,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512F-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v32i16_to_v8i16_3:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512VL-NEXT:    vmovdqa 32(%rdi), %xmm2
; AVX512VL-NEXT:    vmovdqa 48(%rdi), %xmm3
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm4 = [12,13,14,15,6,7,14,15,8,9,10,11,12,13,14,15]
; AVX512VL-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
; AVX512VL-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = [6,7,14,15,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
; AVX512VL-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v32i16_to_v8i16_3:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512BW-NEXT:    vmovdqa 32(%rdi), %xmm2
; AVX512BW-NEXT:    vmovdqa 48(%rdi), %xmm3
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm4 = [12,13,14,15,6,7,14,15,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
; AVX512BW-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [6,7,14,15,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
; AVX512BW-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v32i16_to_v8i16_3:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} ymm1 = <3,7,11,15,19,23,27,31,u,u,u,u,u,u,u,u>
; AVX512BWVL-NEXT:    vpermi2w 32(%rdi), %ymm0, %ymm1
; AVX512BWVL-NEXT:    vmovdqa %xmm1, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <32 x i16>, <32 x i16>* %L
  %strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
  store <8 x i16> %strided.vec, <8 x i16>* %S
  ret void
}

define void @shuffle_v64i8_to_v8i8_1(<64 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX512F-LABEL: shuffle_v64i8_to_v8i8_1:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512F-NEXT:    vmovdqa 32(%rdi), %xmm2
; AVX512F-NEXT:    vmovdqa 48(%rdi), %xmm3
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,1,9,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
; AVX512F-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512F-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v64i8_to_v8i8_1:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512VL-NEXT:    vmovdqa 32(%rdi), %xmm2
; AVX512VL-NEXT:    vmovdqa 48(%rdi), %xmm3
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,1,9,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
; AVX512VL-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v64i8_to_v8i8_1:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512BW-NEXT:    vmovdqa 32(%rdi), %xmm2
; AVX512BW-NEXT:    vmovdqa 48(%rdi), %xmm3
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,1,9,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
; AVX512BW-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_1:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512BWVL-NEXT:    vmovdqa 32(%rdi), %xmm2
; AVX512BWVL-NEXT:    vmovdqa 48(%rdi), %xmm3
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,0,1,1,1,1,9,9,8,8,9,9,10,10,11,11]
; AVX512BWVL-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
; AVX512BWVL-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,1,9,9,8,8,9,9,8,8,9,9,10,10,11,11]
; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BWVL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
; AVX512BWVL-NEXT:    vpmovwb %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 1, i32 9, i32 17, i32 25, i32 33, i32 41, i32 49, i32 57>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}

define void @shuffle_v64i8_to_v8i8_2(<64 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX512F-LABEL: shuffle_v64i8_to_v8i8_2:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512F-NEXT:    vmovdqa 32(%rdi), %xmm2
; AVX512F-NEXT:    vmovdqa 48(%rdi), %xmm3
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,2,10,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
; AVX512F-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512F-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v64i8_to_v8i8_2:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512VL-NEXT:    vmovdqa 32(%rdi), %xmm2
; AVX512VL-NEXT:    vmovdqa 48(%rdi), %xmm3
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,2,10,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
; AVX512VL-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v64i8_to_v8i8_2:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512BW-NEXT:    vmovdqa 32(%rdi), %xmm2
; AVX512BW-NEXT:    vmovdqa 48(%rdi), %xmm3
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,2,10,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
; AVX512BW-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_2:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} ymm1 = <1,5,9,13,17,21,25,29,u,u,u,u,u,u,u,u>
; AVX512BWVL-NEXT:    vpermi2w 32(%rdi), %ymm0, %ymm1
; AVX512BWVL-NEXT:    vpmovwb %xmm1, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 2, i32 10, i32 18, i32 26, i32 34, i32 42, i32 50, i32 58>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}

define void @shuffle_v64i8_to_v8i8_3(<64 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX512F-LABEL: shuffle_v64i8_to_v8i8_3:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512F-NEXT:    vmovdqa 32(%rdi), %xmm2
; AVX512F-NEXT:    vmovdqa 48(%rdi), %xmm3
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,3,11,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
; AVX512F-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512F-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v64i8_to_v8i8_3:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512VL-NEXT:    vmovdqa 32(%rdi), %xmm2
; AVX512VL-NEXT:    vmovdqa 48(%rdi), %xmm3
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,3,11,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
; AVX512VL-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v64i8_to_v8i8_3:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512BW-NEXT:    vmovdqa 32(%rdi), %xmm2
; AVX512BW-NEXT:    vmovdqa 48(%rdi), %xmm3
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,3,11,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
; AVX512BW-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_3:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512BWVL-NEXT:    vmovdqa 32(%rdi), %xmm2
; AVX512BWVL-NEXT:    vmovdqa 48(%rdi), %xmm3
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm4 = [10,10,11,11,3,3,11,11,8,8,9,9,10,10,11,11]
; AVX512BWVL-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
; AVX512BWVL-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = [3,3,11,11,2,2,3,3,8,8,9,9,10,10,11,11]
; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BWVL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
; AVX512BWVL-NEXT:    vpmovwb %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 3, i32 11, i32 19, i32 27, i32 35, i32 43, i32 51, i32 59>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}

define void @shuffle_v64i8_to_v8i8_4(<64 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX512F-LABEL: shuffle_v64i8_to_v8i8_4:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512F-NEXT:    vmovdqa 32(%rdi), %xmm2
; AVX512F-NEXT:    vmovdqa 48(%rdi), %xmm3
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,4,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
; AVX512F-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512F-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v64i8_to_v8i8_4:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512VL-NEXT:    vmovdqa 32(%rdi), %xmm2
; AVX512VL-NEXT:    vmovdqa 48(%rdi), %xmm3
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,4,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
; AVX512VL-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v64i8_to_v8i8_4:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512BW-NEXT:    vmovdqa 32(%rdi), %xmm2
; AVX512BW-NEXT:    vmovdqa 48(%rdi), %xmm3
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,4,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
; AVX512BW-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_4:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} ymm1 = <2,6,10,14,18,22,26,30,u,u,u,u,u,u,u,u>
; AVX512BWVL-NEXT:    vpermi2w 32(%rdi), %ymm0, %ymm1
; AVX512BWVL-NEXT:    vpmovwb %xmm1, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 4, i32 12, i32 20, i32 28, i32 36, i32 44, i32 52, i32 60>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}

define void @shuffle_v64i8_to_v8i8_5(<64 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX512F-LABEL: shuffle_v64i8_to_v8i8_5:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512F-NEXT:    vmovdqa 32(%rdi), %xmm2
; AVX512F-NEXT:    vmovdqa 48(%rdi), %xmm3
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,5,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
; AVX512F-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512F-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v64i8_to_v8i8_5:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512VL-NEXT:    vmovdqa 32(%rdi), %xmm2
; AVX512VL-NEXT:    vmovdqa 48(%rdi), %xmm3
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,5,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
; AVX512VL-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v64i8_to_v8i8_5:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512BW-NEXT:    vmovdqa 32(%rdi), %xmm2
; AVX512BW-NEXT:    vmovdqa 48(%rdi), %xmm3
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,5,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
; AVX512BW-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_5:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512BWVL-NEXT:    vmovdqa 32(%rdi), %xmm2
; AVX512BWVL-NEXT:    vmovdqa 48(%rdi), %xmm3
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm4 = [12,12,13,13,5,5,13,13,4,4,5,5,6,6,7,7]
; AVX512BWVL-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
; AVX512BWVL-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = [5,5,13,13,4,4,5,5,4,4,5,5,6,6,7,7]
; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BWVL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
; AVX512BWVL-NEXT:    vpmovwb %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 5, i32 13, i32 21, i32 29, i32 37, i32 45, i32 53, i32 61>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}

define void @shuffle_v64i8_to_v8i8_6(<64 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX512F-LABEL: shuffle_v64i8_to_v8i8_6:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512F-NEXT:    vmovdqa 32(%rdi), %xmm2
; AVX512F-NEXT:    vmovdqa 48(%rdi), %xmm3
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,6,14,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
; AVX512F-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512F-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v64i8_to_v8i8_6:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512VL-NEXT:    vmovdqa 32(%rdi), %xmm2
; AVX512VL-NEXT:    vmovdqa 48(%rdi), %xmm3
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,6,14,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
; AVX512VL-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v64i8_to_v8i8_6:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512BW-NEXT:    vmovdqa 32(%rdi), %xmm2
; AVX512BW-NEXT:    vmovdqa 48(%rdi), %xmm3
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,6,14,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
; AVX512BW-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_6:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} ymm1 = <3,7,11,15,19,23,27,31,u,u,u,u,u,u,u,u>
; AVX512BWVL-NEXT:    vpermi2w 32(%rdi), %ymm0, %ymm1
; AVX512BWVL-NEXT:    vpmovwb %xmm1, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 6, i32 14, i32 22, i32 30, i32 38, i32 46, i32 54, i32 62>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}

define void @shuffle_v64i8_to_v8i8_7(<64 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX512F-LABEL: shuffle_v64i8_to_v8i8_7:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512F-NEXT:    vmovdqa 32(%rdi), %xmm2
; AVX512F-NEXT:    vmovdqa 48(%rdi), %xmm3
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,7,15,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
; AVX512F-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512F-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v64i8_to_v8i8_7:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512VL-NEXT:    vmovdqa 32(%rdi), %xmm2
; AVX512VL-NEXT:    vmovdqa 48(%rdi), %xmm3
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,7,15,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
; AVX512VL-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v64i8_to_v8i8_7:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512BW-NEXT:    vmovdqa 32(%rdi), %xmm2
; AVX512BW-NEXT:    vmovdqa 48(%rdi), %xmm3
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,7,15,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
; AVX512BW-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_7:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512BWVL-NEXT:    vmovdqa 32(%rdi), %xmm2
; AVX512BWVL-NEXT:    vmovdqa 48(%rdi), %xmm3
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm4 = [14,14,15,15,7,7,15,15,4,4,5,5,6,6,7,7]
; AVX512BWVL-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
; AVX512BWVL-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = [7,7,15,15,6,6,7,7,4,4,5,5,6,6,7,7]
; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BWVL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
; AVX512BWVL-NEXT:    vpmovwb %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 7, i32 15, i32 23, i32 31, i32 39, i32 47, i32 55, i32 63>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}