; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL

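; These tests cover codegen for strided element extraction at a non-zero
; offset from 512-bit vectors (every 2nd, 4th, or 8th element), with the
; narrower result stored out through the second pointer argument.
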
define void @shuffle_v64i8_to_v32i8_1(<64 x i8>* %L, <32 x i8>* %S) nounwind {
; AVX512F-LABEL: shuffle_v64i8_to_v32i8_1:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31]
; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512F-NEXT: vmovdqa %ymm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v64i8_to_v32i8_1:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512VL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31]
; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,5,7]
; AVX512VL-NEXT: vpermi2q %ymm1, %ymm0, %ymm2
; AVX512VL-NEXT: vmovdqa %ymm2, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v64i8_to_v32i8_1:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31]
; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512BW-NEXT: vmovdqa %ymm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v64i8_to_v32i8_1:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31]
; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,5,7]
; AVX512BWVL-NEXT: vpermi2q %ymm1, %ymm0, %ymm2
; AVX512BWVL-NEXT: vmovdqa %ymm2, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <32 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 33, i32 35, i32 37, i32 39, i32 41, i32 43, i32 45, i32 47, i32 49, i32 51, i32 53, i32 55, i32 57, i32 59, i32 61, i32 63>
  store <32 x i8> %strided.vec, <32 x i8>* %S
  ret void
}

define void @shuffle_v32i16_to_v16i16_1(<32 x i16>* %L, <16 x i16>* %S) nounwind {
; AVX512F-LABEL: shuffle_v32i16_to_v16i16_1:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[6,7,2,3,4,5,6,7,2,3,6,7,10,11,14,15,22,23,18,19,20,21,22,23,18,19,22,23,26,27,30,31]
; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,30,31,26,27,28,29,30,31]
; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512F-NEXT: vmovdqa %ymm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i16_to_v16i16_1:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512VL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[6,7,2,3,4,5,6,7,2,3,6,7,10,11,14,15,22,23,18,19,20,21,22,23,18,19,22,23,26,27,30,31]
; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,30,31,26,27,28,29,30,31]
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,5,7]
; AVX512VL-NEXT: vpermi2q %ymm1, %ymm0, %ymm2
; AVX512VL-NEXT: vmovdqa %ymm2, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i16_to_v16i16_1:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[6,7,2,3,4,5,6,7,2,3,6,7,10,11,14,15,22,23,18,19,20,21,22,23,18,19,22,23,26,27,30,31]
; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,30,31,26,27,28,29,30,31]
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512BW-NEXT: vmovdqa %ymm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i16_to_v16i16_1:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm1 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
; AVX512BWVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1
; AVX512BWVL-NEXT: vmovdqa %ymm1, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %vec = load <32 x i16>, <32 x i16>* %L
  %strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
  store <16 x i16> %strided.vec, <16 x i16>* %S
  ret void
}

define void @shuffle_v16i32_to_v8i32_1(<16 x i32>* %L, <8 x i32>* %S) nounwind {
; AVX512F-LABEL: shuffle_v16i32_to_v8i32_1:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovaps (%rdi), %ymm0
; AVX512F-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],mem[1,3],ymm0[5,7],mem[5,7]
; AVX512F-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512F-NEXT: vmovaps %ymm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i32_to_v8i32_1:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [1,3,5,7,9,11,13,15]
; AVX512VL-NEXT: vpermi2d 32(%rdi), %ymm0, %ymm1
; AVX512VL-NEXT: vmovdqa %ymm1, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v16i32_to_v8i32_1:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovaps (%rdi), %ymm0
; AVX512BW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],mem[1,3],ymm0[5,7],mem[5,7]
; AVX512BW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512BW-NEXT: vmovaps %ymm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v16i32_to_v8i32_1:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm1 = [1,3,5,7,9,11,13,15]
; AVX512BWVL-NEXT: vpermi2d 32(%rdi), %ymm0, %ymm1
; AVX512BWVL-NEXT: vmovdqa %ymm1, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %vec = load <16 x i32>, <16 x i32>* %L
  %strided.vec = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  store <8 x i32> %strided.vec, <8 x i32>* %S
  ret void
}

define void @shuffle_v64i8_to_v16i8_1(<64 x i8>* %L, <16 x i8>* %S) nounwind {
; AVX512-LABEL: shuffle_v64i8_to_v16i8_1:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512-NEXT: vmovdqa 48(%rdi), %xmm3
; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
; AVX512-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX512-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
; AVX512-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512-NEXT: retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
  store <16 x i8> %strided.vec, <16 x i8>* %S
  ret void
}

define void @shuffle_v64i8_to_v16i8_2(<64 x i8>* %L, <16 x i8>* %S) nounwind {
; AVX512-LABEL: shuffle_v64i8_to_v16i8_2:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512-NEXT: vmovdqa 48(%rdi), %xmm3
; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
; AVX512-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX512-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
; AVX512-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512-NEXT: retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62>
  store <16 x i8> %strided.vec, <16 x i8>* %S
  ret void
}

define void @shuffle_v64i8_to_v16i8_3(<64 x i8>* %L, <16 x i8>* %S) nounwind {
; AVX512-LABEL: shuffle_v64i8_to_v16i8_3:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512-NEXT: vmovdqa 48(%rdi), %xmm3
; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
; AVX512-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX512-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
; AVX512-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512-NEXT: retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
  store <16 x i8> %strided.vec, <16 x i8>* %S
  ret void
}

define void @shuffle_v32i16_to_v8i16_1(<32 x i16>* %L, <8 x i16>* %S) nounwind {
; AVX512F-LABEL: shuffle_v32i16_to_v8i16_1:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7]
; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = mem[0,2,2,3]
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = mem[0,2,2,3]
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
; AVX512F-NEXT: vpshufd {{.*#+}} xmm2 = mem[0,2,2,3]
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i16_to_v8i16_1:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15]
; AVX512VL-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX512VL-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i16_to_v8i16_1:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm3
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX512BW-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i16_to_v8i16_1:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm0 = [1,5,9,13,17,21,25,29]
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm1
; AVX512BWVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1
; AVX512BWVL-NEXT: vmovdqa %xmm1, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %vec = load <32 x i16>, <32 x i16>* %L
  %strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
  store <8 x i16> %strided.vec, <8 x i16>* %S
  ret void
}

define void @shuffle_v32i16_to_v8i16_2(<32 x i16>* %L, <8 x i16>* %S) nounwind {
; AVX512F-LABEL: shuffle_v32i16_to_v8i16_2:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7]
; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = mem[3,1,2,3]
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7]
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = mem[3,1,2,3]
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
; AVX512F-NEXT: vpshufd {{.*#+}} xmm2 = mem[3,1,2,3]
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7]
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i16_to_v8i16_2:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [12,13,14,15,4,5,12,13,8,9,10,11,12,13,14,15]
; AVX512VL-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX512VL-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,12,13,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i16_to_v8i16_2:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm3
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [12,13,14,15,4,5,12,13,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX512BW-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,12,13,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i16_to_v8i16_2:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm0 = [2,6,10,14,18,22,26,30]
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm1
; AVX512BWVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1
; AVX512BWVL-NEXT: vmovdqa %xmm1, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %vec = load <32 x i16>, <32 x i16>* %L
  %strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
  store <8 x i16> %strided.vec, <8 x i16>* %S
  ret void
}

define void @shuffle_v32i16_to_v8i16_3(<32 x i16>* %L, <8 x i16>* %S) nounwind {
; AVX512F-LABEL: shuffle_v32i16_to_v8i16_3:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7]
; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = mem[3,1,2,3]
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7]
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = mem[3,1,2,3]
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
; AVX512F-NEXT: vpshufd {{.*#+}} xmm2 = mem[3,1,2,3]
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i16_to_v8i16_3:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [12,13,14,15,6,7,14,15,8,9,10,11,12,13,14,15]
; AVX512VL-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX512VL-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [6,7,14,15,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i16_to_v8i16_3:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm3
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [12,13,14,15,6,7,14,15,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX512BW-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [6,7,14,15,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i16_to_v8i16_3:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm0 = [3,7,11,15,19,23,27,31]
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm1
; AVX512BWVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1
; AVX512BWVL-NEXT: vmovdqa %xmm1, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %vec = load <32 x i16>, <32 x i16>* %L
  %strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
  store <8 x i16> %strided.vec, <8 x i16>* %S
  ret void
}

define void @shuffle_v64i8_to_v8i8_1(<64 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX512-LABEL: shuffle_v64i8_to_v8i8_1:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512-NEXT: vmovdqa 48(%rdi), %xmm3
; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,1,9,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX512-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
; AVX512-NEXT: vmovq %xmm0, (%rsi)
; AVX512-NEXT: retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 1, i32 9, i32 17, i32 25, i32 33, i32 41, i32 49, i32 57>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}

define void @shuffle_v64i8_to_v8i8_2(<64 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX512-LABEL: shuffle_v64i8_to_v8i8_2:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512-NEXT: vmovdqa 48(%rdi), %xmm3
; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,2,10,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX512-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
; AVX512-NEXT: vmovq %xmm0, (%rsi)
; AVX512-NEXT: retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 2, i32 10, i32 18, i32 26, i32 34, i32 42, i32 50, i32 58>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}

define void @shuffle_v64i8_to_v8i8_3(<64 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX512-LABEL: shuffle_v64i8_to_v8i8_3:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512-NEXT: vmovdqa 48(%rdi), %xmm3
; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,3,11,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX512-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
; AVX512-NEXT: vmovq %xmm0, (%rsi)
; AVX512-NEXT: retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 3, i32 11, i32 19, i32 27, i32 35, i32 43, i32 51, i32 59>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}

define void @shuffle_v64i8_to_v8i8_4(<64 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX512-LABEL: shuffle_v64i8_to_v8i8_4:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512-NEXT: vmovdqa 48(%rdi), %xmm3
; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,4,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX512-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
; AVX512-NEXT: vmovq %xmm0, (%rsi)
; AVX512-NEXT: retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 4, i32 12, i32 20, i32 28, i32 36, i32 44, i32 52, i32 60>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}

define void @shuffle_v64i8_to_v8i8_5(<64 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX512-LABEL: shuffle_v64i8_to_v8i8_5:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512-NEXT: vmovdqa 48(%rdi), %xmm3
; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,5,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX512-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
; AVX512-NEXT: vmovq %xmm0, (%rsi)
; AVX512-NEXT: retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 5, i32 13, i32 21, i32 29, i32 37, i32 45, i32 53, i32 61>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}

define void @shuffle_v64i8_to_v8i8_6(<64 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX512-LABEL: shuffle_v64i8_to_v8i8_6:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512-NEXT: vmovdqa 48(%rdi), %xmm3
; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,6,14,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX512-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
; AVX512-NEXT: vmovq %xmm0, (%rsi)
; AVX512-NEXT: retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 6, i32 14, i32 22, i32 30, i32 38, i32 46, i32 54, i32 62>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}

define void @shuffle_v64i8_to_v8i8_7(<64 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX512-LABEL: shuffle_v64i8_to_v8i8_7:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512-NEXT: vmovdqa 48(%rdi), %xmm3
; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,7,15,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX512-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
; AVX512-NEXT: vmovq %xmm0, (%rsi)
; AVX512-NEXT: retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 7, i32 15, i32 23, i32 31, i32 39, i32 47, i32 55, i32 63>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}