; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL

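; Extract the odd elements (stride 2, offset 1) of a <32 x i8> load as <16 x i8>.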
define void @shuffle_v32i8_to_v16i8_1(ptr %L, ptr %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v16i8_1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vmovq {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v16i8_1:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512F-NEXT:    vmovq {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
; AVX512F-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512F-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v16i8_1:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512VL-NEXT:    vmovq {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v16i8_1:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v16i8_1:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrlw $8, (%rdi), %ymm0
; AVX512BWVL-NEXT:    vpmovwb %ymm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <32 x i8>, ptr %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
  store <16 x i8> %strided.vec, ptr %S
  ret void
}

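; Extract the odd elements (stride 2, offset 1) of a <16 x i16> load as <8 x i16>.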
define void @shuffle_v16i16_to_v8i16_1(ptr %L, ptr %S) nounwind {
; AVX-LABEL: shuffle_v16i16_to_v8i16_1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vpsrld $16, %xmm1, %xmm1
; AVX-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v16i16_to_v8i16_1:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpsrld $16, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512F-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v16i16_to_v8i16_1:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrld $16, (%rdi), %ymm0
; AVX512VL-NEXT:    vpmovdw %ymm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v8i16_1:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vpsrld $16, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512BW-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v8i16_1:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrld $16, (%rdi), %ymm0
; AVX512BWVL-NEXT:    vpmovdw %ymm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <16 x i16>, ptr %L
  %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  store <8 x i16> %strided.vec, ptr %S
  ret void
}

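; Extract the odd elements (stride 2, offset 1) of an <8 x i32> load as <4 x i32>.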
define void @shuffle_v8i32_to_v4i32_1(ptr %L, ptr %S) nounwind {
; AVX-LABEL: shuffle_v8i32_to_v4i32_1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps (%rdi), %xmm0
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],mem[1,3]
; AVX-NEXT:    vmovaps %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v8i32_to_v4i32_1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovaps (%rdi), %xmm0
; AVX512-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],mem[1,3]
; AVX512-NEXT:    vmovaps %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <8 x i32>, ptr %L
  %strided.vec = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  store <4 x i32> %strided.vec, ptr %S
  ret void
}

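; Stride-4 extractions of <32 x i8> to <8 x i8> at offsets 1-3.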
define void @shuffle_v32i8_to_v8i8_1(ptr %L, ptr %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v8i8_1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vmovd {{.*#+}} xmm2 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-NEXT:    vmovq %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v8i8_1:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpsrld $8, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v8i8_1:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrld $8, (%rdi), %ymm0
; AVX512VL-NEXT:    vpmovdb %ymm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v8i8_1:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vpsrld $8, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8_1:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrld $8, (%rdi), %ymm0
; AVX512BWVL-NEXT:    vpmovdb %ymm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <32 x i8>, ptr %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
  store <8 x i8> %strided.vec, ptr %S
  ret void
}

define void @shuffle_v32i8_to_v8i8_2(ptr %L, ptr %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v8i8_2:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vmovd {{.*#+}} xmm2 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-NEXT:    vmovq %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v8i8_2:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpsrld $16, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v8i8_2:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrld $16, (%rdi), %ymm0
; AVX512VL-NEXT:    vpmovdb %ymm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v8i8_2:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vpsrld $16, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8_2:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrld $16, (%rdi), %ymm0
; AVX512BWVL-NEXT:    vpmovdb %ymm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <32 x i8>, ptr %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
  store <8 x i8> %strided.vec, ptr %S
  ret void
}

define void @shuffle_v32i8_to_v8i8_3(ptr %L, ptr %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v8i8_3:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vmovd {{.*#+}} xmm2 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-NEXT:    vmovq %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v8i8_3:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpsrld $24, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v8i8_3:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrld $24, (%rdi), %ymm0
; AVX512VL-NEXT:    vpmovdb %ymm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v8i8_3:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vpsrld $24, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8_3:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrld $24, (%rdi), %ymm0
; AVX512BWVL-NEXT:    vpmovdb %ymm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <32 x i8>, ptr %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
  store <8 x i8> %strided.vec, ptr %S
  ret void
}

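; Stride-4 extractions of <16 x i16> to <4 x i16> at offsets 1-3.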
define void @shuffle_v16i16_to_v4i16_1(ptr %L, ptr %S) nounwind {
; AVX1-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = mem[0,2,2,3]
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX1-NEXT:    vmovq %xmm0, (%rsi)
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = mem[0,2,2,3]
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX2-SLOW-NEXT:    vmovq %xmm0, (%rsi)
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-FAST-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-FAST-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-FAST-NEXT:    vmovq %xmm0, (%rsi)
; AVX2-FAST-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpsrlq $16, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovqw %zmm0, %xmm0
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrlq $16, (%rdi), %ymm0
; AVX512VL-NEXT:    vpmovqw %ymm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vpsrlq $16, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovqw %zmm0, %xmm0
; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrlq $16, (%rdi), %ymm0
; AVX512BWVL-NEXT:    vpmovqw %ymm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <16 x i16>, ptr %L
  %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
  store <4 x i16> %strided.vec, ptr %S
  ret void
}

define void @shuffle_v16i16_to_v4i16_2(ptr %L, ptr %S) nounwind {
; AVX1-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = mem[3,1,2,3]
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX1-NEXT:    vmovq %xmm0, (%rsi)
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = mem[3,1,2,3]
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX2-SLOW-NEXT:    vmovq %xmm0, (%rsi)
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-FAST-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = [4,5,12,13,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-FAST-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-FAST-NEXT:    vmovq %xmm0, (%rsi)
; AVX2-FAST-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpsrlq $32, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovqw %zmm0, %xmm0
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrlq $32, (%rdi), %ymm0
; AVX512VL-NEXT:    vpmovqw %ymm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vpsrlq $32, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovqw %zmm0, %xmm0
; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrlq $32, (%rdi), %ymm0
; AVX512BWVL-NEXT:    vpmovqw %ymm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <16 x i16>, ptr %L
  %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
  store <4 x i16> %strided.vec, ptr %S
  ret void
}

define void @shuffle_v16i16_to_v4i16_3(ptr %L, ptr %S) nounwind {
; AVX1-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = mem[3,1,2,3]
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX1-NEXT:    vmovq %xmm0, (%rsi)
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = mem[3,1,2,3]
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX2-SLOW-NEXT:    vmovq %xmm0, (%rsi)
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-FAST-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = [6,7,14,15,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-FAST-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-FAST-NEXT:    vmovq %xmm0, (%rsi)
; AVX2-FAST-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpsrlq $48, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovqw %zmm0, %xmm0
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrlq $48, (%rdi), %ymm0
; AVX512VL-NEXT:    vpmovqw %ymm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vpsrlq $48, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovqw %zmm0, %xmm0
; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrlq $48, (%rdi), %ymm0
; AVX512BWVL-NEXT:    vpmovqw %ymm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <16 x i16>, ptr %L
  %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
  store <4 x i16> %strided.vec, ptr %S
  ret void
}

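; Stride-8 extractions of <32 x i8> to <4 x i8> at offsets 1-7.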
define void @shuffle_v32i8_to_v4i8_1(ptr %L, ptr %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v4i8_1:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX1-NEXT:    vmovd {{.*#+}} xmm2 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT:    vmovd %xmm0, (%rsi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v32i8_to_v4i8_1:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT:    vmovd %xmm0, (%rsi)
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v4i8_1:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpsrlq $8, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovqb %zmm0, %xmm0
; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_1:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrlq $8, (%rdi), %ymm0
; AVX512VL-NEXT:    vpmovqb %ymm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_1:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vpsrlq $8, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovqb %zmm0, %xmm0
; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_1:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrlq $8, (%rdi), %ymm0
; AVX512BWVL-NEXT:    vpmovqb %ymm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <32 x i8>, ptr %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 1, i32 9, i32 17, i32 25>
  store <4 x i8> %strided.vec, ptr %S
  ret void
}

define void @shuffle_v32i8_to_v4i8_2(ptr %L, ptr %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v4i8_2:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX1-NEXT:    vmovd {{.*#+}} xmm2 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT:    vmovd %xmm0, (%rsi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v32i8_to_v4i8_2:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT:    vmovd %xmm0, (%rsi)
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v4i8_2:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpsrlq $16, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovqb %zmm0, %xmm0
; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_2:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrlq $16, (%rdi), %ymm0
; AVX512VL-NEXT:    vpmovqb %ymm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_2:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vpsrlq $16, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovqb %zmm0, %xmm0
; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_2:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrlq $16, (%rdi), %ymm0
; AVX512BWVL-NEXT:    vpmovqb %ymm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <32 x i8>, ptr %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 2, i32 10, i32 18, i32 26>
  store <4 x i8> %strided.vec, ptr %S
  ret void
}

define void @shuffle_v32i8_to_v4i8_3(ptr %L, ptr %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v4i8_3:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX1-NEXT:    vmovd {{.*#+}} xmm2 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT:    vmovd %xmm0, (%rsi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v32i8_to_v4i8_3:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT:    vmovd %xmm0, (%rsi)
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v4i8_3:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpsrlq $24, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovqb %zmm0, %xmm0
; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_3:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrlq $24, (%rdi), %ymm0
; AVX512VL-NEXT:    vpmovqb %ymm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_3:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vpsrlq $24, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovqb %zmm0, %xmm0
; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_3:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrlq $24, (%rdi), %ymm0
; AVX512BWVL-NEXT:    vpmovqb %ymm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <32 x i8>, ptr %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 3, i32 11, i32 19, i32 27>
  store <4 x i8> %strided.vec, ptr %S
  ret void
}

define void @shuffle_v32i8_to_v4i8_4(ptr %L, ptr %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v4i8_4:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX1-NEXT:    vmovd {{.*#+}} xmm2 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT:    vmovd %xmm0, (%rsi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v32i8_to_v4i8_4:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT:    vmovd %xmm0, (%rsi)
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v4i8_4:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpsrlq $32, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovqb %zmm0, %xmm0
; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_4:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrlq $32, (%rdi), %ymm0
; AVX512VL-NEXT:    vpmovqb %ymm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_4:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vpsrlq $32, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovqb %zmm0, %xmm0
; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_4:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrlq $32, (%rdi), %ymm0
; AVX512BWVL-NEXT:    vpmovqb %ymm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <32 x i8>, ptr %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 4, i32 12, i32 20, i32 28>
  store <4 x i8> %strided.vec, ptr %S
  ret void
}

define void @shuffle_v32i8_to_v4i8_5(ptr %L, ptr %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v4i8_5:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX1-NEXT:    vmovd {{.*#+}} xmm2 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT:    vmovd %xmm0, (%rsi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v32i8_to_v4i8_5:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT:    vmovd %xmm0, (%rsi)
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v4i8_5:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpsrlq $40, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovqb %zmm0, %xmm0
; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_5:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrlq $40, (%rdi), %ymm0
; AVX512VL-NEXT:    vpmovqb %ymm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_5:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vpsrlq $40, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovqb %zmm0, %xmm0
; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_5:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrlq $40, (%rdi), %ymm0
; AVX512BWVL-NEXT:    vpmovqb %ymm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <32 x i8>, ptr %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 5, i32 13, i32 21, i32 29>
  store <4 x i8> %strided.vec, ptr %S
  ret void
}

define void @shuffle_v32i8_to_v4i8_6(ptr %L, ptr %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v4i8_6:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX1-NEXT:    vmovd {{.*#+}} xmm2 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT:    vmovd %xmm0, (%rsi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v32i8_to_v4i8_6:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT:    vmovd %xmm0, (%rsi)
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v4i8_6:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpsrlq $48, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovqb %zmm0, %xmm0
; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_6:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrlq $48, (%rdi), %ymm0
; AVX512VL-NEXT:    vpmovqb %ymm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_6:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vpsrlq $48, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovqb %zmm0, %xmm0
; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_6:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrlq $48, (%rdi), %ymm0
; AVX512BWVL-NEXT:    vpmovqb %ymm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <32 x i8>, ptr %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 6, i32 14, i32 22, i32 30>
  store <4 x i8> %strided.vec, ptr %S
  ret void
}

define void @shuffle_v32i8_to_v4i8_7(ptr %L, ptr %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v4i8_7:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX1-NEXT:    vmovd {{.*#+}} xmm2 = [7,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT:    vmovd %xmm0, (%rsi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v32i8_to_v4i8_7:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT:    vmovd %xmm0, (%rsi)
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v4i8_7:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpsrlq $56, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovqb %zmm0, %xmm0
; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_7:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrlq $56, (%rdi), %ymm0
; AVX512VL-NEXT:    vpmovqb %ymm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_7:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vpsrlq $56, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovqb %zmm0, %xmm0
; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_7:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrlq $56, (%rdi), %ymm0
; AVX512BWVL-NEXT:    vpmovqb %ymm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <32 x i8>, ptr %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 7, i32 15, i32 23, i32 31>
  store <4 x i8> %strided.vec, ptr %S
  ret void
}