; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL

define void @shuffle_v32i8_to_v16i8_1(<32 x i8>* %L, <16 x i8>* %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v16i8_1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v32i8_to_v16i8_1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
  store <16 x i8> %strided.vec, <16 x i8>* %S
  ret void
}

define void @shuffle_v16i16_to_v8i16_1(<16 x i16>* %L, <8 x i16>* %S) nounwind {
; AVX-LABEL: shuffle_v16i16_to_v8i16_1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v16i16_to_v8i16_1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <16 x i16>, <16 x i16>* %L
  %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  store <8 x i16> %strided.vec, <8 x i16>* %S
  ret void
}

define void @shuffle_v8i32_to_v4i32_1(<8 x i32>* %L, <4 x i32>* %S) nounwind {
; AVX-LABEL: shuffle_v8i32_to_v4i32_1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps (%rdi), %xmm0
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],mem[1,3]
; AVX-NEXT:    vmovaps %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v8i32_to_v4i32_1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovaps (%rdi), %xmm0
; AVX512-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],mem[1,3]
; AVX512-NEXT:    vmovaps %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <8 x i32>, <8 x i32>* %L
  %strided.vec = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  store <4 x i32> %strided.vec, <4 x i32>* %S
  ret void
}

define void @shuffle_v32i8_to_v8i8_1(<32 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v8i8_1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-NEXT:    vmovq %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v8i8_1:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v8i8_1:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v8i8_1:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8_1:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,1,5,5,9,9,13,13,13,13,5,5,12,12,13,13]
; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT:    vpmovwb %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}

define void @shuffle_v32i8_to_v8i8_2(<32 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v8i8_2:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-NEXT:    vmovq %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v8i8_2:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v8i8_2:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v8i8_2:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8_2:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT:    vpmovwb %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}

define void @shuffle_v32i8_to_v8i8_3(<32 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v8i8_3:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-NEXT:    vmovq %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v8i8_3:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v8i8_3:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v8i8_3:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8_3:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm2 = [3,3,7,7,11,11,15,15,7,7,15,15,6,6,7,7]
; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT:    vpmovwb %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}

define void @shuffle_v16i16_to_v4i16_1(<16 x i16>* %L, <4 x i16>* %S) nounwind {
; AVX1-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = mem[0,2,2,3]
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX1-NEXT:    vmovq %xmm0, (%rsi)
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = mem[0,2,2,3]
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX2-SLOW-NEXT:    vmovq %xmm0, (%rsi)
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-FAST-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-FAST-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-FAST-NEXT:    vmovq %xmm0, (%rsi)
; AVX2-FAST-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = mem[0,2,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,3,2,3,10,11,10,11,8,9,10,11,12,13,14,15]
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT:    vpmovdw %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,3,2,3,10,11,10,11,8,9,10,11,12,13,14,15]
; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT:    vpmovdw %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <16 x i16>, <16 x i16>* %L
  %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
  store <4 x i16> %strided.vec, <4 x i16>* %S
  ret void
}

define void @shuffle_v16i16_to_v4i16_2(<16 x i16>* %L, <4 x i16>* %S) nounwind {
; AVX1-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = mem[3,1,2,3]
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX1-NEXT:    vmovq %xmm0, (%rsi)
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = mem[3,1,2,3]
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX2-SLOW-NEXT:    vmovq %xmm0, (%rsi)
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-FAST-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = [4,5,12,13,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-FAST-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-FAST-NEXT:    vmovq %xmm0, (%rsi)
; AVX2-FAST-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = mem[3,1,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovaps (%rdi), %xmm0
; AVX512VL-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],mem[1,3]
; AVX512VL-NEXT:    vpmovdw %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = [4,5,12,13,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovaps (%rdi), %xmm0
; AVX512BWVL-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],mem[1,3]
; AVX512BWVL-NEXT:    vpmovdw %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <16 x i16>, <16 x i16>* %L
  %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
  store <4 x i16> %strided.vec, <4 x i16>* %S
  ret void
}

define void @shuffle_v16i16_to_v4i16_3(<16 x i16>* %L, <4 x i16>* %S) nounwind {
; AVX1-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = mem[3,1,2,3]
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX1-NEXT:    vmovq %xmm0, (%rsi)
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = mem[3,1,2,3]
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX2-SLOW-NEXT:    vmovq %xmm0, (%rsi)
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-FAST-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = [6,7,14,15,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-FAST-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-FAST-NEXT:    vmovq %xmm0, (%rsi)
; AVX2-FAST-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = mem[3,1,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [6,7,14,15,14,15,6,7,8,9,10,11,12,13,14,15]
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT:    vpmovdw %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = [6,7,14,15,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm2 = [6,7,14,15,14,15,6,7,8,9,10,11,12,13,14,15]
; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT:    vpmovdw %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <16 x i16>, <16 x i16>* %L
  %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
  store <4 x i16> %strided.vec, <4 x i16>* %S
  ret void
}

define void @shuffle_v32i8_to_v4i8_1(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v4i8_1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT:    vmovd %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v4i8_1:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_1:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,1,255,255,9,9,255,255,9,9,255,255,11,11,255,255]
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT:    vpmovdb %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_1:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_1:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,1,255,255,9,9,255,255,9,9,255,255,11,11,255,255]
; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT:    vpmovdb %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 1, i32 9, i32 17, i32 25>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}

define void @shuffle_v32i8_to_v4i8_2(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v4i8_2:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT:    vmovd %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v4i8_2:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_2:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,3,2,3,10,11,10,11,8,9,10,11,12,13,14,15]
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT:    vpmovdb %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_2:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_2:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,3,2,3,10,11,10,11,8,9,10,11,12,13,14,15]
; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT:    vpmovdb %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 2, i32 10, i32 18, i32 26>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}

define void @shuffle_v32i8_to_v4i8_3(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v4i8_3:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT:    vmovd %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v4i8_3:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_3:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [3,3,11,11,11,11,3,3,8,8,9,9,10,10,11,11]
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT:    vpmovdb %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_3:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_3:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm2 = [3,3,11,11,11,11,3,3,8,8,9,9,10,10,11,11]
; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT:    vpmovdb %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 3, i32 11, i32 19, i32 27>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}

define void @shuffle_v32i8_to_v4i8_4(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v4i8_4:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT:    vmovd %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v4i8_4:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_4:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovaps (%rdi), %xmm0
; AVX512VL-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],mem[1,3]
; AVX512VL-NEXT:    vpmovdb %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_4:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_4:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovaps (%rdi), %xmm0
; AVX512BWVL-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],mem[1,3]
; AVX512BWVL-NEXT:    vpmovdb %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 4, i32 12, i32 20, i32 28>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}

define void @shuffle_v32i8_to_v4i8_5(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v4i8_5:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT:    vmovd %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v4i8_5:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_5:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [5,5,13,13,13,13,5,5,4,4,5,5,6,6,7,7]
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT:    vpmovdb %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_5:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_5:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm2 = [5,5,13,13,13,13,5,5,4,4,5,5,6,6,7,7]
; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT:    vpmovdb %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 5, i32 13, i32 21, i32 29>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}

define void @shuffle_v32i8_to_v4i8_6(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v4i8_6:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT:    vmovd %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v4i8_6:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_6:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [6,7,14,15,14,15,6,7,8,9,10,11,12,13,14,15]
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT:    vpmovdb %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_6:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_6:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm2 = [6,7,14,15,14,15,6,7,8,9,10,11,12,13,14,15]
; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT:    vpmovdb %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 6, i32 14, i32 22, i32 30>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}

define void @shuffle_v32i8_to_v4i8_7(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v4i8_7:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT:    vmovd %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v4i8_7:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_7:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [7,7,14,14,15,15,14,14,15,15,4,4,5,5,6,6]
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT:    vpmovdb %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_7:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_7:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm2 = [7,7,14,14,15,15,14,14,15,15,4,4,5,5,6,6]
; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT:    vpmovdb %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 7, i32 15, i32 23, i32 31>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}