; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL,AVX512BWVL-FAST-ALL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL,AVX512BWVL-FAST-PERLANE

define void @shuffle_v64i8_to_v32i8_1(<64 x i8>* %L, <32 x i8>* %S) nounwind {
; AVX512F-LABEL: shuffle_v64i8_to_v32i8_1:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31]
; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512F-NEXT: vmovdqa %ymm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BWVL-FAST-ALL-LABEL: shuffle_v64i8_to_v32i8_1:
; AVX512BWVL-FAST-ALL: # %bb.0:
; AVX512BWVL-FAST-ALL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-FAST-ALL-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512BWVL-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31]
; AVX512BWVL-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
; AVX512BWVL-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,5,7]
; AVX512BWVL-FAST-ALL-NEXT: vpermi2q %ymm1, %ymm0, %ymm2
; AVX512BWVL-FAST-ALL-NEXT: vmovdqa %ymm2, (%rsi)
; AVX512BWVL-FAST-ALL-NEXT: vzeroupper
; AVX512BWVL-FAST-ALL-NEXT: retq
;
; AVX512BWVL-FAST-PERLANE-LABEL: shuffle_v64i8_to_v32i8_1:
; AVX512BWVL-FAST-PERLANE: # %bb.0:
; AVX512BWVL-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512BWVL-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31]
; AVX512BWVL-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
; AVX512BWVL-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX512BWVL-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512BWVL-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rsi)
; AVX512BWVL-FAST-PERLANE-NEXT: vzeroupper
; AVX512BWVL-FAST-PERLANE-NEXT: retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <32 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 33, i32 35, i32 37, i32 39, i32 41, i32 43, i32 45, i32 47, i32 49, i32 51, i32 53, i32 55, i32 57, i32 59, i32 61, i32 63>
  store <32 x i8> %strided.vec, <32 x i8>* %S
  ret void
}

define void @shuffle_v32i16_to_v16i16_1(<32 x i16>* %L, <16 x i16>* %S) nounwind {
; AVX512F-LABEL: shuffle_v32i16_to_v16i16_1:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31]
; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u]
; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512F-NEXT: vmovdqa %ymm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i16_to_v16i16_1:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,5,7,33,35,37,39,9,11,13,15,41,43,45,47]
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1
; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm2
; AVX512BW-NEXT: vpermt2w %zmm2, %zmm0, %zmm1
; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm1[0,2,1,3]
; AVX512BW-NEXT: vmovdqa %ymm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i16_to_v16i16_1:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm1 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
; AVX512BWVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1
; AVX512BWVL-NEXT: vmovdqa %ymm1, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %vec = load <32 x i16>, <32 x i16>* %L
  %strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
  store <16 x i16> %strided.vec, <16 x i16>* %S
  ret void
}

define void @shuffle_v16i32_to_v8i32_1(<16 x i32>* %L, <8 x i32>* %S) nounwind {
; AVX512F-LABEL: shuffle_v16i32_to_v8i32_1:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovaps (%rdi), %ymm0
; AVX512F-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],mem[1,3],ymm0[5,7],mem[5,7]
; AVX512F-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512F-NEXT: vmovaps %ymm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BWVL-FAST-ALL-LABEL: shuffle_v16i32_to_v8i32_1:
; AVX512BWVL-FAST-ALL: # %bb.0:
; AVX512BWVL-FAST-ALL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [1,3,5,7,9,11,13,15]
; AVX512BWVL-FAST-ALL-NEXT: vpermi2d 32(%rdi), %ymm0, %ymm1
; AVX512BWVL-FAST-ALL-NEXT: vmovdqa %ymm1, (%rsi)
; AVX512BWVL-FAST-ALL-NEXT: vzeroupper
; AVX512BWVL-FAST-ALL-NEXT: retq
;
; AVX512BWVL-FAST-PERLANE-LABEL: shuffle_v16i32_to_v8i32_1:
; AVX512BWVL-FAST-PERLANE: # %bb.0:
; AVX512BWVL-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm0
; AVX512BWVL-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],mem[1,3],ymm0[5,7],mem[5,7]
; AVX512BWVL-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512BWVL-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rsi)
; AVX512BWVL-FAST-PERLANE-NEXT: vzeroupper
; AVX512BWVL-FAST-PERLANE-NEXT: retq
  %vec = load <16 x i32>, <16 x i32>* %L
  %strided.vec = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  store <8 x i32> %strided.vec, <8 x i32>* %S
  ret void
}

define void @shuffle_v64i8_to_v16i8_1(<64 x i8>* %L, <16 x i8>* %S) nounwind {
; AVX512-LABEL: shuffle_v64i8_to_v16i8_1:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512-NEXT: vmovdqa 48(%rdi), %xmm3
; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
; AVX512-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX512-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
; AVX512-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512-NEXT: retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
  store <16 x i8> %strided.vec, <16 x i8>* %S
  ret void
}

define void @shuffle_v64i8_to_v16i8_2(<64 x i8>* %L, <16 x i8>* %S) nounwind {
; AVX512-LABEL: shuffle_v64i8_to_v16i8_2:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512-NEXT: vmovdqa 48(%rdi), %xmm3
; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
; AVX512-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX512-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
; AVX512-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512-NEXT: retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62>
  store <16 x i8> %strided.vec, <16 x i8>* %S
  ret void
}

define void @shuffle_v64i8_to_v16i8_3(<64 x i8>* %L, <16 x i8>* %S) nounwind {
; AVX512-LABEL: shuffle_v64i8_to_v16i8_3:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512-NEXT: vmovdqa 48(%rdi), %xmm3
; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
; AVX512-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX512-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
; AVX512-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512-NEXT: retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
  store <16 x i8> %strided.vec, <16 x i8>* %S
  ret void
}

define void @shuffle_v32i16_to_v8i16_1(<32 x i16>* %L, <8 x i16>* %S) nounwind {
; AVX512F-LABEL: shuffle_v32i16_to_v8i16_1:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7]
; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = mem[0,2,2,3]
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = mem[0,2,2,3]
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
; AVX512F-NEXT: vpshufd {{.*#+}} xmm2 = mem[0,2,2,3]
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i16_to_v8i16_1:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15]
; AVX512VL-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX512VL-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i16_to_v8i16_1:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = <1,5,9,13,33,37,41,45,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1
; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm2
; AVX512BW-NEXT: vpermt2w %zmm2, %zmm0, %zmm1
; AVX512BW-NEXT: vmovdqa %xmm1, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i16_to_v8i16_1:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm0 = [1,5,9,13,17,21,25,29]
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm1
; AVX512BWVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1
; AVX512BWVL-NEXT: vmovdqa %xmm1, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %vec = load <32 x i16>, <32 x i16>* %L
  %strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
  store <8 x i16> %strided.vec, <8 x i16>* %S
  ret void
}

define void @shuffle_v32i16_to_v8i16_2(<32 x i16>* %L, <8 x i16>* %S) nounwind {
; AVX512F-LABEL: shuffle_v32i16_to_v8i16_2:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7]
; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = mem[3,1,2,3]
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7]
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = mem[3,1,2,3]
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
; AVX512F-NEXT: vpshufd {{.*#+}} xmm2 = mem[3,1,2,3]
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7]
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i16_to_v8i16_2:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [12,13,14,15,4,5,12,13,8,9,10,11,12,13,14,15]
; AVX512VL-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX512VL-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,12,13,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i16_to_v8i16_2:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = <2,6,10,14,34,38,42,46,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1
; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm2
; AVX512BW-NEXT: vpermt2w %zmm2, %zmm0, %zmm1
; AVX512BW-NEXT: vmovdqa %xmm1, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i16_to_v8i16_2:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm0 = [2,6,10,14,18,22,26,30]
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm1
; AVX512BWVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1
; AVX512BWVL-NEXT: vmovdqa %xmm1, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %vec = load <32 x i16>, <32 x i16>* %L
  %strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
  store <8 x i16> %strided.vec, <8 x i16>* %S
  ret void
}

define void @shuffle_v32i16_to_v8i16_3(<32 x i16>* %L, <8 x i16>* %S) nounwind {
; AVX512F-LABEL: shuffle_v32i16_to_v8i16_3:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7]
; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = mem[3,1,2,3]
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7]
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = mem[3,1,2,3]
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
; AVX512F-NEXT: vpshufd {{.*#+}} xmm2 = mem[3,1,2,3]
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i16_to_v8i16_3:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [12,13,14,15,6,7,14,15,8,9,10,11,12,13,14,15]
; AVX512VL-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX512VL-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [6,7,14,15,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i16_to_v8i16_3:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = <3,7,11,15,35,39,43,47,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1
; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm2
; AVX512BW-NEXT: vpermt2w %zmm2, %zmm0, %zmm1
; AVX512BW-NEXT: vmovdqa %xmm1, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i16_to_v8i16_3:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm0 = [3,7,11,15,19,23,27,31]
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm1
; AVX512BWVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1
; AVX512BWVL-NEXT: vmovdqa %xmm1, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %vec = load <32 x i16>, <32 x i16>* %L
  %strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
  store <8 x i16> %strided.vec, <8 x i16>* %S
  ret void
}

define void @shuffle_v64i8_to_v8i8_1(<64 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX512-LABEL: shuffle_v64i8_to_v8i8_1:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512-NEXT: vmovdqa 48(%rdi), %xmm3
; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,1,9,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX512-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
; AVX512-NEXT: vmovq %xmm0, (%rsi)
; AVX512-NEXT: retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 1, i32 9, i32 17, i32 25, i32 33, i32 41, i32 49, i32 57>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}

define void @shuffle_v64i8_to_v8i8_2(<64 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX512-LABEL: shuffle_v64i8_to_v8i8_2:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512-NEXT: vmovdqa 48(%rdi), %xmm3
; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,2,10,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX512-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
; AVX512-NEXT: vmovq %xmm0, (%rsi)
; AVX512-NEXT: retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 2, i32 10, i32 18, i32 26, i32 34, i32 42, i32 50, i32 58>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}

define void @shuffle_v64i8_to_v8i8_3(<64 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX512-LABEL: shuffle_v64i8_to_v8i8_3:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512-NEXT: vmovdqa 48(%rdi), %xmm3
; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,3,11,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX512-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
; AVX512-NEXT: vmovq %xmm0, (%rsi)
; AVX512-NEXT: retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 3, i32 11, i32 19, i32 27, i32 35, i32 43, i32 51, i32 59>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}

define void @shuffle_v64i8_to_v8i8_4(<64 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX512-LABEL: shuffle_v64i8_to_v8i8_4:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512-NEXT: vmovdqa 48(%rdi), %xmm3
; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,4,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX512-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
; AVX512-NEXT: vmovq %xmm0, (%rsi)
; AVX512-NEXT: retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 4, i32 12, i32 20, i32 28, i32 36, i32 44, i32 52, i32 60>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}

define void @shuffle_v64i8_to_v8i8_5(<64 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX512-LABEL: shuffle_v64i8_to_v8i8_5:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512-NEXT: vmovdqa 48(%rdi), %xmm3
; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,5,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX512-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
; AVX512-NEXT: vmovq %xmm0, (%rsi)
; AVX512-NEXT: retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 5, i32 13, i32 21, i32 29, i32 37, i32 45, i32 53, i32 61>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}

define void @shuffle_v64i8_to_v8i8_6(<64 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX512-LABEL: shuffle_v64i8_to_v8i8_6:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512-NEXT: vmovdqa 48(%rdi), %xmm3
; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,6,14,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX512-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
; AVX512-NEXT: vmovq %xmm0, (%rsi)
; AVX512-NEXT: retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 6, i32 14, i32 22, i32 30, i32 38, i32 46, i32 54, i32 62>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}

define void @shuffle_v64i8_to_v8i8_7(<64 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX512-LABEL: shuffle_v64i8_to_v8i8_7:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512-NEXT: vmovdqa 48(%rdi), %xmm3
; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,7,15,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX512-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
; AVX512-NEXT: vmovq %xmm0, (%rsi)
; AVX512-NEXT: retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 7, i32 15, i32 23, i32 31, i32 39, i32 47, i32 55, i32 63>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}