; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
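
; Strided extraction shuffles from 256-bit vectors: each function below loads
; a 256-bit vector and stores the elements found at a fixed stride, starting
; at a nonzero offset within the stride.

; <32 x i8> -> <16 x i8>: every 2nd byte, starting at byte 1.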
define void @shuffle_v32i8_to_v16i8_1(ptr %L, ptr %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v16i8_1:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX1-NEXT:    vmovddup {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
; AVX1-NEXT:    # xmm2 = mem[0,0]
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v32i8_to_v16i8_1:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v16i8_1:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512F-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
; AVX512F-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512F-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v16i8_1:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512VL-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v16i8_1:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v16i8_1:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrlw $8, (%rdi), %ymm0
; AVX512BWVL-NEXT:    vpmovwb %ymm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <32 x i8>, ptr %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
  store <16 x i8> %strided.vec, ptr %S
  ret void
}

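; <16 x i16> -> <8 x i16>: every 2nd word, starting at word 1.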
define void @shuffle_v16i16_to_v8i16_1(ptr %L, ptr %S) nounwind {
; AVX-LABEL: shuffle_v16i16_to_v8i16_1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vpsrld $16, %xmm1, %xmm1
; AVX-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v16i16_to_v8i16_1:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpsrld $16, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512F-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v16i16_to_v8i16_1:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrld $16, (%rdi), %ymm0
; AVX512VL-NEXT:    vpmovdw %ymm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v8i16_1:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vpsrld $16, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512BW-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v8i16_1:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrld $16, (%rdi), %ymm0
; AVX512BWVL-NEXT:    vpmovdw %ymm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <16 x i16>, ptr %L
  %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  store <8 x i16> %strided.vec, ptr %S
  ret void
}

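; <8 x i32> -> <4 x i32>: every 2nd dword, starting at dword 1.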
define void @shuffle_v8i32_to_v4i32_1(ptr %L, ptr %S) nounwind {
; AVX-LABEL: shuffle_v8i32_to_v4i32_1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps (%rdi), %xmm0
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],mem[1,3]
; AVX-NEXT:    vmovaps %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v8i32_to_v4i32_1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovaps (%rdi), %xmm0
; AVX512-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],mem[1,3]
; AVX512-NEXT:    vmovaps %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <8 x i32>, ptr %L
  %strided.vec = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  store <4 x i32> %strided.vec, ptr %S
  ret void
}

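; <32 x i8> -> <8 x i8>: every 4th byte, at offsets 1 through 3.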
define void @shuffle_v32i8_to_v8i8_1(ptr %L, ptr %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v8i8_1:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT:    vmovq %xmm0, (%rsi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v32i8_to_v8i8_1:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-NEXT:    vmovq %xmm0, (%rsi)
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v8i8_1:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpsrld $8, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v8i8_1:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrld $8, (%rdi), %ymm0
; AVX512VL-NEXT:    vpmovdb %ymm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v8i8_1:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vpsrld $8, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8_1:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrld $8, (%rdi), %ymm0
; AVX512BWVL-NEXT:    vpmovdb %ymm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <32 x i8>, ptr %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
  store <8 x i8> %strided.vec, ptr %S
  ret void
}

define void @shuffle_v32i8_to_v8i8_2(ptr %L, ptr %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v8i8_2:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT:    vmovq %xmm0, (%rsi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v32i8_to_v8i8_2:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-NEXT:    vmovq %xmm0, (%rsi)
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v8i8_2:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpsrld $16, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v8i8_2:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrld $16, (%rdi), %ymm0
; AVX512VL-NEXT:    vpmovdb %ymm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v8i8_2:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vpsrld $16, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8_2:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrld $16, (%rdi), %ymm0
; AVX512BWVL-NEXT:    vpmovdb %ymm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <32 x i8>, ptr %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
  store <8 x i8> %strided.vec, ptr %S
  ret void
}

define void @shuffle_v32i8_to_v8i8_3(ptr %L, ptr %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v8i8_3:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT:    vmovq %xmm0, (%rsi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v32i8_to_v8i8_3:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-NEXT:    vmovq %xmm0, (%rsi)
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v8i8_3:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpsrld $24, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v8i8_3:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrld $24, (%rdi), %ymm0
; AVX512VL-NEXT:    vpmovdb %ymm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v8i8_3:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vpsrld $24, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8_3:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrld $24, (%rdi), %ymm0
; AVX512BWVL-NEXT:    vpmovdb %ymm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <32 x i8>, ptr %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
  store <8 x i8> %strided.vec, ptr %S
  ret void
}

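; <16 x i16> -> <4 x i16>: every 4th word, at offsets 1 through 3.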
define void @shuffle_v16i16_to_v4i16_1(ptr %L, ptr %S) nounwind {
; AVX1-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = mem[0,2,2,3]
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX1-NEXT:    vmovq %xmm0, (%rsi)
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = mem[0,2,2,3]
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX2-SLOW-NEXT:    vmovq %xmm0, (%rsi)
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-FAST-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-FAST-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-FAST-NEXT:    vmovq %xmm0, (%rsi)
; AVX2-FAST-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpsrlq $16, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovqw %zmm0, %xmm0
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrlq $16, (%rdi), %ymm0
; AVX512VL-NEXT:    vpmovqw %ymm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vpsrlq $16, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovqw %zmm0, %xmm0
; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrlq $16, (%rdi), %ymm0
; AVX512BWVL-NEXT:    vpmovqw %ymm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <16 x i16>, ptr %L
  %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
  store <4 x i16> %strided.vec, ptr %S
  ret void
}

define void @shuffle_v16i16_to_v4i16_2(ptr %L, ptr %S) nounwind {
; AVX1-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = mem[3,1,2,3]
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX1-NEXT:    vmovq %xmm0, (%rsi)
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = mem[3,1,2,3]
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX2-SLOW-NEXT:    vmovq %xmm0, (%rsi)
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-FAST-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = [4,5,12,13,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-FAST-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-FAST-NEXT:    vmovq %xmm0, (%rsi)
; AVX2-FAST-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpsrlq $32, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovqw %zmm0, %xmm0
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrlq $32, (%rdi), %ymm0
; AVX512VL-NEXT:    vpmovqw %ymm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vpsrlq $32, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovqw %zmm0, %xmm0
; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrlq $32, (%rdi), %ymm0
; AVX512BWVL-NEXT:    vpmovqw %ymm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <16 x i16>, ptr %L
  %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
  store <4 x i16> %strided.vec, ptr %S
  ret void
}

define void @shuffle_v16i16_to_v4i16_3(ptr %L, ptr %S) nounwind {
; AVX1-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = mem[3,1,2,3]
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX1-NEXT:    vmovq %xmm0, (%rsi)
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = mem[3,1,2,3]
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX2-SLOW-NEXT:    vmovq %xmm0, (%rsi)
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-FAST-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = [6,7,14,15,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-FAST-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-FAST-NEXT:    vmovq %xmm0, (%rsi)
; AVX2-FAST-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpsrlq $48, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovqw %zmm0, %xmm0
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrlq $48, (%rdi), %ymm0
; AVX512VL-NEXT:    vpmovqw %ymm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vpsrlq $48, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovqw %zmm0, %xmm0
; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrlq $48, (%rdi), %ymm0
; AVX512BWVL-NEXT:    vpmovqw %ymm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <16 x i16>, ptr %L
  %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
  store <4 x i16> %strided.vec, ptr %S
  ret void
}

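; <32 x i8> -> <4 x i8>: every 8th byte, at offsets 1 through 7.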
define void @shuffle_v32i8_to_v4i8_1(ptr %L, ptr %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v4i8_1:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0]
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT:    vmovd %xmm0, (%rsi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v32i8_to_v4i8_1:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT:    vmovd %xmm0, (%rsi)
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v4i8_1:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpsrlq $8, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovqb %zmm0, %xmm0
; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_1:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrlq $8, (%rdi), %ymm0
; AVX512VL-NEXT:    vpmovqb %ymm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_1:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vpsrlq $8, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovqb %zmm0, %xmm0
; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_1:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrlq $8, (%rdi), %ymm0
; AVX512BWVL-NEXT:    vpmovqb %ymm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <32 x i8>, ptr %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 1, i32 9, i32 17, i32 25>
  store <4 x i8> %strided.vec, ptr %S
  ret void
}

define void @shuffle_v32i8_to_v4i8_2(ptr %L, ptr %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v4i8_2:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0]
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT:    vmovd %xmm0, (%rsi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v32i8_to_v4i8_2:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT:    vmovd %xmm0, (%rsi)
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v4i8_2:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpsrlq $16, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovqb %zmm0, %xmm0
; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_2:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrlq $16, (%rdi), %ymm0
; AVX512VL-NEXT:    vpmovqb %ymm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_2:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vpsrlq $16, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovqb %zmm0, %xmm0
; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_2:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrlq $16, (%rdi), %ymm0
; AVX512BWVL-NEXT:    vpmovqb %ymm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <32 x i8>, ptr %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 2, i32 10, i32 18, i32 26>
  store <4 x i8> %strided.vec, ptr %S
  ret void
}

define void @shuffle_v32i8_to_v4i8_3(ptr %L, ptr %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v4i8_3:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0]
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT:    vmovd %xmm0, (%rsi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v32i8_to_v4i8_3:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT:    vmovd %xmm0, (%rsi)
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v4i8_3:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpsrlq $24, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovqb %zmm0, %xmm0
; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_3:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrlq $24, (%rdi), %ymm0
; AVX512VL-NEXT:    vpmovqb %ymm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_3:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vpsrlq $24, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovqb %zmm0, %xmm0
; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_3:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrlq $24, (%rdi), %ymm0
; AVX512BWVL-NEXT:    vpmovqb %ymm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <32 x i8>, ptr %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 3, i32 11, i32 19, i32 27>
  store <4 x i8> %strided.vec, ptr %S
  ret void
}

define void @shuffle_v32i8_to_v4i8_4(ptr %L, ptr %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v4i8_4:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0]
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT:    vmovd %xmm0, (%rsi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v32i8_to_v4i8_4:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT:    vmovd %xmm0, (%rsi)
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v4i8_4:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpsrlq $32, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovqb %zmm0, %xmm0
; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_4:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrlq $32, (%rdi), %ymm0
; AVX512VL-NEXT:    vpmovqb %ymm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_4:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vpsrlq $32, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovqb %zmm0, %xmm0
; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_4:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrlq $32, (%rdi), %ymm0
; AVX512BWVL-NEXT:    vpmovqb %ymm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <32 x i8>, ptr %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 4, i32 12, i32 20, i32 28>
  store <4 x i8> %strided.vec, ptr %S
  ret void
}

define void @shuffle_v32i8_to_v4i8_5(ptr %L, ptr %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v4i8_5:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0]
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT:    vmovd %xmm0, (%rsi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v32i8_to_v4i8_5:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT:    vmovd %xmm0, (%rsi)
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v4i8_5:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpsrlq $40, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovqb %zmm0, %xmm0
; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_5:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrlq $40, (%rdi), %ymm0
; AVX512VL-NEXT:    vpmovqb %ymm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_5:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vpsrlq $40, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovqb %zmm0, %xmm0
; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_5:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrlq $40, (%rdi), %ymm0
; AVX512BWVL-NEXT:    vpmovqb %ymm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <32 x i8>, ptr %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 5, i32 13, i32 21, i32 29>
  store <4 x i8> %strided.vec, ptr %S
  ret void
}

define void @shuffle_v32i8_to_v4i8_6(ptr %L, ptr %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v4i8_6:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0]
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT:    vmovd %xmm0, (%rsi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v32i8_to_v4i8_6:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT:    vmovd %xmm0, (%rsi)
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v4i8_6:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpsrlq $48, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovqb %zmm0, %xmm0
; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_6:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrlq $48, (%rdi), %ymm0
; AVX512VL-NEXT:    vpmovqb %ymm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_6:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vpsrlq $48, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovqb %zmm0, %xmm0
; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_6:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrlq $48, (%rdi), %ymm0
; AVX512BWVL-NEXT:    vpmovqb %ymm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <32 x i8>, ptr %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 6, i32 14, i32 22, i32 30>
  store <4 x i8> %strided.vec, ptr %S
  ret void
}

define void @shuffle_v32i8_to_v4i8_7(ptr %L, ptr %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v4i8_7:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0]
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT:    vmovd %xmm0, (%rsi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v32i8_to_v4i8_7:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT:    vmovd %xmm0, (%rsi)
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v4i8_7:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpsrlq $56, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovqb %zmm0, %xmm0
; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_7:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrlq $56, (%rdi), %ymm0
; AVX512VL-NEXT:    vpmovqb %ymm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_7:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vpsrlq $56, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovqb %zmm0, %xmm0
; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_7:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrlq $56, (%rdi), %ymm0
; AVX512BWVL-NEXT:    vpmovqb %ymm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <32 x i8>, ptr %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 7, i32 15, i32 23, i32 31>
  store <4 x i8> %strided.vec, ptr %S
  ret void
}