; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BWVL
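
; These tests take a strided subvector of a 256-bit vector starting at a
; nonzero offset (every 2nd, 4th, or 8th element) and store it out as a
; narrower vector.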
define void @shuffle_v32i8_to_v16i8_1(<32 x i8>* %L, <16 x i8>* %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v16i8_1:
; AVX1: # BB#0:
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT: vmovdqa %xmm0, (%rsi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_to_v16i8_1:
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vmovdqa %xmm0, (%rsi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: shuffle_v32i8_to_v16i8_1:
; AVX512: # BB#0:
; AVX512-NEXT: vmovdqa (%rdi), %ymm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
  store <16 x i8> %strided.vec, <16 x i8>* %S
  ret void
}

define void @shuffle_v16i16_to_v8i16_1(<16 x i16>* %L, <8 x i16>* %S) nounwind {
; AVX1-LABEL: shuffle_v16i16_to_v8i16_1:
; AVX1: # BB#0:
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT: vmovdqa %xmm0, (%rsi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_to_v8i16_1:
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vmovdqa %xmm0, (%rsi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: shuffle_v16i16_to_v8i16_1:
; AVX512: # BB#0:
; AVX512-NEXT: vmovdqa (%rdi), %ymm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %vec = load <16 x i16>, <16 x i16>* %L
  %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  store <8 x i16> %strided.vec, <8 x i16>* %S
  ret void
}

define void @shuffle_v8i32_to_v4i32_1(<8 x i32>* %L, <4 x i32>* %S) nounwind {
; AVX-LABEL: shuffle_v8i32_to_v4i32_1:
; AVX: # BB#0:
; AVX-NEXT: vmovaps (%rdi), %ymm0
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; AVX-NEXT: vmovaps %xmm0, (%rsi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: shuffle_v8i32_to_v4i32_1:
; AVX512: # BB#0:
; AVX512-NEXT: vmovaps (%rdi), %ymm0
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; AVX512-NEXT: vmovaps %xmm0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %vec = load <8 x i32>, <8 x i32>* %L
  %strided.vec = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  store <4 x i32> %strided.vec, <4 x i32>* %S
  ret void
}

define void @shuffle_v32i8_to_v8i8_1(<32 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v8i8_1:
; AVX1: # BB#0:
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT: vmovq %xmm0, (%rsi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_to_v8i8_1:
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-NEXT: vmovq %xmm0, (%rsi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v8i8_1:
; AVX512F: # BB#0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512F-NEXT: vmovq %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v8i8_1:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v8i8_1:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8_1:
; AVX512BWVL: # BB#0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,5,5,9,9,13,13,13,13,5,5,12,12,13,13]
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}

define void @shuffle_v32i8_to_v8i8_2(<32 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v8i8_2:
; AVX1: # BB#0:
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT: vmovq %xmm0, (%rsi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_to_v8i8_2:
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-NEXT: vmovq %xmm0, (%rsi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v8i8_2:
; AVX512F: # BB#0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512F-NEXT: vmovq %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v8i8_2:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v8i8_2:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8_2:
; AVX512BWVL: # BB#0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}

define void @shuffle_v32i8_to_v8i8_3(<32 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v8i8_3:
; AVX1: # BB#0:
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT: vmovq %xmm0, (%rsi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_to_v8i8_3:
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-NEXT: vmovq %xmm0, (%rsi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v8i8_3:
; AVX512F: # BB#0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512F-NEXT: vmovq %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v8i8_3:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v8i8_3:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8_3:
; AVX512BWVL: # BB#0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [3,3,7,7,11,11,15,15,7,7,15,15,6,6,7,7]
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}

define void @shuffle_v16i16_to_v4i16_1(<16 x i16>* %L, <4 x i16>* %S) nounwind {
; AVX1-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX1: # BB#0:
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT: vmovq %xmm0, (%rsi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-NEXT: vmovq %xmm0, (%rsi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX512F: # BB#0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512F-NEXT: vmovq %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,1,3,3,4,5,6,7]
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,3,3,4,5,6,7]
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: vpmovdw %xmm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX512BWVL: # BB#0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,1,3,3,4,5,6,7]
; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,3,3,4,5,6,7]
; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT: vpmovdw %xmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %vec = load <16 x i16>, <16 x i16>* %L
  %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
  store <4 x i16> %strided.vec, <4 x i16>* %S
  ret void
}

define void @shuffle_v16i16_to_v4i16_2(<16 x i16>* %L, <4 x i16>* %S) nounwind {
; AVX1-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX1: # BB#0:
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT: vmovq %xmm0, (%rsi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-NEXT: vmovq %xmm0, (%rsi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX512F: # BB#0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512F-NEXT: vmovq %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovaps (%rdi), %ymm0
; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; AVX512VL-NEXT: vpmovdw %xmm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX512BWVL: # BB#0:
; AVX512BWVL-NEXT: vmovaps (%rdi), %ymm0
; AVX512BWVL-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; AVX512BWVL-NEXT: vpmovdw %xmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %vec = load <16 x i16>, <16 x i16>* %L
  %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
  store <4 x i16> %strided.vec, <4 x i16>* %S
  ret void
}

define void @shuffle_v16i16_to_v4i16_3(<16 x i16>* %L, <4 x i16>* %S) nounwind {
; AVX1-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX1: # BB#0:
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT: vmovq %xmm0, (%rsi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-NEXT: vmovq %xmm0, (%rsi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX512F: # BB#0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512F-NEXT: vmovq %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,1,3,4,5,6,7]
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,1,3,4,5,6,7]
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: vpmovdw %xmm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX512BWVL: # BB#0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,1,3,4,5,6,7]
; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,1,3,4,5,6,7]
; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT: vpmovdw %xmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %vec = load <16 x i16>, <16 x i16>* %L
  %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
  store <4 x i16> %strided.vec, <4 x i16>* %S
  ret void
}

define void @shuffle_v32i8_to_v4i8_1(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v4i8_1:
; AVX1: # BB#0:
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vmovd %xmm0, (%rsi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_to_v4i8_1:
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT: vmovd %xmm0, (%rsi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v4i8_1:
; AVX512F: # BB#0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512F-NEXT: vmovd %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_1:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,1,1,8,8,9,9,8,8,9,9,10,10,11,11]
; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT: vpsrld $16, %xmm1, %xmm1
; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vpsrld $16, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_1:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_1:
; AVX512BWVL: # BB#0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,1,1,8,8,9,9,8,8,9,9,10,10,11,11]
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpsrld $16, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpsrld $16, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 1, i32 9, i32 17, i32 25>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}

define void @shuffle_v32i8_to_v4i8_2(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v4i8_2:
; AVX1: # BB#0:
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vmovd %xmm0, (%rsi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_to_v4i8_2:
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT: vmovd %xmm0, (%rsi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v4i8_2:
; AVX512F: # BB#0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512F-NEXT: vmovd %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_2:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,1,3,3,4,5,6,7]
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,3,3,4,5,6,7]
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_2:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_2:
; AVX512BWVL: # BB#0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,1,3,3,4,5,6,7]
; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,3,3,4,5,6,7]
; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 2, i32 10, i32 18, i32 26>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}

define void @shuffle_v32i8_to_v4i8_3(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v4i8_3:
; AVX1: # BB#0:
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vmovd %xmm0, (%rsi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_to_v4i8_3:
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT: vmovd %xmm0, (%rsi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v4i8_3:
; AVX512F: # BB#0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512F-NEXT: vmovd %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_3:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [10,10,11,11,2,2,3,3,8,8,9,9,10,10,11,11]
; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,1,3,4,5,6,7]
; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,1,3,4,5,6,7]
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_3:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_3:
; AVX512BWVL: # BB#0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [10,10,11,11,2,2,3,3,8,8,9,9,10,10,11,11]
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,1,3,4,5,6,7]
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,1,3,4,5,6,7]
; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 3, i32 11, i32 19, i32 27>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}

define void @shuffle_v32i8_to_v4i8_4(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v4i8_4:
; AVX1: # BB#0:
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vmovd %xmm0, (%rsi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_to_v4i8_4:
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT: vmovd %xmm0, (%rsi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v4i8_4:
; AVX512F: # BB#0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512F-NEXT: vmovd %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_4:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovaps (%rdi), %ymm0
; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_4:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_4:
; AVX512BWVL: # BB#0:
; AVX512BWVL-NEXT: vmovaps (%rdi), %ymm0
; AVX512BWVL-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 4, i32 12, i32 20, i32 28>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}

define void @shuffle_v32i8_to_v4i8_5(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v4i8_5:
; AVX1: # BB#0:
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vmovd %xmm0, (%rsi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_to_v4i8_5:
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT: vmovd %xmm0, (%rsi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v4i8_5:
; AVX512F: # BB#0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512F-NEXT: vmovd %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_5:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX512VL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,1,3,4,5,6,7]
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,1,3,4,5,6,7]
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_5:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_5:
; AVX512BWVL: # BB#0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,1,3,4,5,6,7]
; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,1,3,4,5,6,7]
; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 5, i32 13, i32 21, i32 29>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}

define void @shuffle_v32i8_to_v4i8_6(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v4i8_6:
; AVX1: # BB#0:
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vmovd %xmm0, (%rsi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_to_v4i8_6:
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT: vmovd %xmm0, (%rsi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v4i8_6:
; AVX512F: # BB#0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512F-NEXT: vmovd %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_6:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,1,3,4,5,6,7]
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,1,3,4,5,6,7]
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_6:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_6:
; AVX512BWVL: # BB#0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,1,3,4,5,6,7]
; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,1,3,4,5,6,7]
; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 6, i32 14, i32 22, i32 30>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}

define void @shuffle_v32i8_to_v4i8_7(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v4i8_7:
; AVX1: # BB#0:
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vmovd %xmm0, (%rsi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_to_v4i8_7:
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT: vmovd %xmm0, (%rsi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v4i8_7:
; AVX512F: # BB#0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512F-NEXT: vmovd %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_7:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,14,14,15,15,14,14,15,15,4,4,5,5,6,6]
; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_7:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_7:
; AVX512BWVL: # BB#0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,14,14,15,15,14,14,15,15,4,4,5,5,6,6]
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 7, i32 15, i32 23, i32 31>
  store <4 x i8> %strided.vec, <4 x i8>* %S