; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE42
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL

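; Stride 2, offset 1: extract the odd bytes of a <16 x i8> load into a <8 x i8> store.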
define void @shuffle_v16i8_to_v8i8_1(<16 x i8>* %L, <8 x i8>* %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v8i8_1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movq %xmm0, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: shuffle_v16i8_to_v8i8_1:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    movq %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: shuffle_v16i8_to_v8i8_1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vmovq %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v16i8_to_v8i8_1:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v16i8_to_v8i8_1:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v16i8_to_v8i8_1:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v16i8_to_v8i8_1:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrlw $8, (%rdi), %xmm0
; AVX512BWVL-NEXT:    vpmovwb %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <16 x i8>, <16 x i8>* %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}

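; Stride 2, offset 1: extract the odd words of a <8 x i16> load into a <4 x i16> store.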
define void @shuffle_v8i16_to_v4i16_1(<8 x i16>* %L, <4 x i16>* %S) nounwind {
; SSE2-LABEL: shuffle_v8i16_to_v4i16_1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = mem[3,1,2,3,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
; SSE2-NEXT:    movq %xmm0, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: shuffle_v8i16_to_v4i16_1:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
; SSE42-NEXT:    movq %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: shuffle_v8i16_to_v4i16_1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
; AVX-NEXT:    vmovq %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v8i16_to_v4i16_1:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v8i16_to_v4i16_1:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrld $16, (%rdi), %xmm0
; AVX512VL-NEXT:    vpmovdw %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v8i16_to_v4i16_1:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v8i16_to_v4i16_1:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrld $16, (%rdi), %xmm0
; AVX512BWVL-NEXT:    vpmovdw %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <8 x i16>, <8 x i16>* %L
  %strided.vec = shufflevector <8 x i16> %vec, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  store <4 x i16> %strided.vec, <4 x i16>* %S
  ret void
}

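; Stride 2, offset 1: extract the odd dwords of a <4 x i32> load into a <2 x i32> store.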
define void @shuffle_v4i32_to_v2i32_1(<4 x i32>* %L, <2 x i32>* %S) nounwind {
; SSE-LABEL: shuffle_v4i32_to_v2i32_1:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = mem[1,3,2,3]
; SSE-NEXT:    movq %xmm0, (%rsi)
; SSE-NEXT:    retq
;
; AVX-LABEL: shuffle_v4i32_to_v2i32_1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = mem[1,3,2,3]
; AVX-NEXT:    vmovlps %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v4i32_to_v2i32_1:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpermilps {{.*#+}} xmm0 = mem[1,3,2,3]
; AVX512F-NEXT:    vmovlps %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v4i32_to_v2i32_1:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = mem[1,1,3,3]
; AVX512VL-NEXT:    vpmovqd %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v4i32_to_v2i32_1:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpermilps {{.*#+}} xmm0 = mem[1,3,2,3]
; AVX512BW-NEXT:    vmovlps %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v4i32_to_v2i32_1:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm0 = mem[1,1,3,3]
; AVX512BWVL-NEXT:    vpmovqd %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <4 x i32>, <4 x i32>* %L
  %strided.vec = shufflevector <4 x i32> %vec, <4 x i32> undef, <2 x i32> <i32 1, i32 3>
  store <2 x i32> %strided.vec, <2 x i32>* %S
  ret void
}

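; Stride 4, offset 1: extract bytes 1, 5, 9 and 13 of a <16 x i8> load.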
define void @shuffle_v16i8_to_v4i8_1(<16 x i8>* %L, <4 x i8>* %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v4i8_1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movd %xmm0, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: shuffle_v16i8_to_v4i8_1:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    movd %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: shuffle_v16i8_to_v4i8_1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vmovd %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v16i8_to_v4i8_1:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v16i8_to_v4i8_1:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vpsrlw $8, %xmm0, %xmm0
; AVX512VL-NEXT:    vpmovdb %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v16i8_to_v4i8_1:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v16i8_to_v4i8_1:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrlw $8, (%rdi), %xmm0
; AVX512BWVL-NEXT:    vpmovdb %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <16 x i8>, <16 x i8>* %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}

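; Stride 4, offset 2: extract bytes 2, 6, 10 and 14 of a <16 x i8> load.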
define void @shuffle_v16i8_to_v4i8_2(<16 x i8>* %L, <4 x i8>* %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v4i8_2:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movd %xmm0, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: shuffle_v16i8_to_v4i8_2:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    movd %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: shuffle_v16i8_to_v4i8_2:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vmovd %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v16i8_to_v4i8_2:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v16i8_to_v4i8_2:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrld $16, (%rdi), %xmm0
; AVX512VL-NEXT:    vpmovdb %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v16i8_to_v4i8_2:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v16i8_to_v4i8_2:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrld $16, (%rdi), %xmm0
; AVX512BWVL-NEXT:    vpmovdb %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <16 x i8>, <16 x i8>* %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}

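; Stride 4, offset 3: extract bytes 3, 7, 11 and 15 of a <16 x i8> load.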
define void @shuffle_v16i8_to_v4i8_3(<16 x i8>* %L, <4 x i8>* %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v4i8_3:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movd %xmm0, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: shuffle_v16i8_to_v4i8_3:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    movd %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: shuffle_v16i8_to_v4i8_3:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vmovd %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v16i8_to_v4i8_3:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v16i8_to_v4i8_3:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrld $24, (%rdi), %xmm0
; AVX512VL-NEXT:    vpmovdb %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v16i8_to_v4i8_3:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v16i8_to_v4i8_3:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrld $24, (%rdi), %xmm0
; AVX512BWVL-NEXT:    vpmovdb %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <16 x i8>, <16 x i8>* %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}

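; Stride 4, offset 1: extract words 1 and 5 of a <8 x i16> load.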
define void @shuffle_v8i16_to_v2i16_1(<8 x i16>* %L, <2 x i16>* %S) nounwind {
; SSE-LABEL: shuffle_v8i16_to_v2i16_1:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; SSE-NEXT:    movd %xmm0, (%rsi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: shuffle_v8i16_to_v2i16_1:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; AVX1-NEXT:    vmovd %xmm0, (%rsi)
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: shuffle_v8i16_to_v2i16_1:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vmovd %xmm0, (%rsi)
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: shuffle_v8i16_to_v2i16_1:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
; AVX2-FAST-NEXT:    vmovd %xmm0, (%rsi)
; AVX2-FAST-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v8i16_to_v2i16_1:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v8i16_to_v2i16_1:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrld $16, (%rdi), %xmm0
; AVX512VL-NEXT:    vpmovqw %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v8i16_to_v2i16_1:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v8i16_to_v2i16_1:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrld $16, (%rdi), %xmm0
; AVX512BWVL-NEXT:    vpmovqw %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <8 x i16>, <8 x i16>* %L
  %strided.vec = shufflevector <8 x i16> %vec, <8 x i16> undef, <2 x i32> <i32 1, i32 5>
  store <2 x i16> %strided.vec, <2 x i16>* %S
  ret void
}

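; Stride 4, offset 2: extract words 2 and 6 of a <8 x i16> load.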
define void @shuffle_v8i16_to_v2i16_2(<8 x i16>* %L, <2 x i16>* %S) nounwind {
; SSE-LABEL: shuffle_v8i16_to_v2i16_2:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; SSE-NEXT:    movd %xmm0, (%rsi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: shuffle_v8i16_to_v2i16_2:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; AVX1-NEXT:    vmovd %xmm0, (%rsi)
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: shuffle_v8i16_to_v2i16_2:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vmovd %xmm0, (%rsi)
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: shuffle_v8i16_to_v2i16_2:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,12,13,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX2-FAST-NEXT:    vmovd %xmm0, (%rsi)
; AVX2-FAST-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v8i16_to_v2i16_2:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v8i16_to_v2i16_2:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = mem[1,1,3,3]
; AVX512VL-NEXT:    vpmovqw %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v8i16_to_v2i16_2:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,12,13,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v8i16_to_v2i16_2:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm0 = mem[1,1,3,3]
; AVX512BWVL-NEXT:    vpmovqw %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <8 x i16>, <8 x i16>* %L
  %strided.vec = shufflevector <8 x i16> %vec, <8 x i16> undef, <2 x i32> <i32 2, i32 6>
  store <2 x i16> %strided.vec, <2 x i16>* %S
  ret void
}

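; Stride 4, offset 3: extract words 3 and 7 of a <8 x i16> load.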
define void @shuffle_v8i16_to_v2i16_3(<8 x i16>* %L, <2 x i16>* %S) nounwind {
; SSE-LABEL: shuffle_v8i16_to_v2i16_3:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; SSE-NEXT:    movd %xmm0, (%rsi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: shuffle_v8i16_to_v2i16_3:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; AVX1-NEXT:    vmovd %xmm0, (%rsi)
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: shuffle_v8i16_to_v2i16_3:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vmovd %xmm0, (%rsi)
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: shuffle_v8i16_to_v2i16_3:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX2-FAST-NEXT:    vmovd %xmm0, (%rsi)
; AVX2-FAST-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v8i16_to_v2i16_3:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v8i16_to_v2i16_3:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrlq $48, (%rdi), %xmm0
; AVX512VL-NEXT:    vpmovqw %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v8i16_to_v2i16_3:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v8i16_to_v2i16_3:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrlq $48, (%rdi), %xmm0
; AVX512BWVL-NEXT:    vpmovqw %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <8 x i16>, <8 x i16>* %L
  %strided.vec = shufflevector <8 x i16> %vec, <8 x i16> undef, <2 x i32> <i32 3, i32 7>
  store <2 x i16> %strided.vec, <2 x i16>* %S
  ret void
}

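; Stride 8, offset 1: extract bytes 1 and 9 of a <16 x i8> load.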
define void @shuffle_v16i8_to_v2i8_1(<16 x i8>* %L, <2 x i8>* %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v2i8_1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    movw %ax, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: shuffle_v16i8_to_v2i8_1:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    pextrw $0, %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: shuffle_v16i8_to_v2i8_1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v16i8_to_v2i8_1:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v16i8_to_v2i8_1:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vpsrlw $8, %xmm0, %xmm0
; AVX512VL-NEXT:    vpmovqb %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v16i8_to_v2i8_1:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v16i8_to_v2i8_1:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrlw $8, (%rdi), %xmm0
; AVX512BWVL-NEXT:    vpmovqb %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <16 x i8>, <16 x i8>* %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 1, i32 9>
  store <2 x i8> %strided.vec, <2 x i8>* %S
  ret void
}

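; Stride 8, offset 2: extract bytes 2 and 10 of a <16 x i8> load.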
define void @shuffle_v16i8_to_v2i8_2(<16 x i8>* %L, <2 x i8>* %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v2i8_2:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    movw %ax, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: shuffle_v16i8_to_v2i8_2:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    pextrw $0, %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: shuffle_v16i8_to_v2i8_2:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v16i8_to_v2i8_2:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v16i8_to_v2i8_2:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrld $16, (%rdi), %xmm0
; AVX512VL-NEXT:    vpmovqb %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v16i8_to_v2i8_2:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v16i8_to_v2i8_2:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrld $16, (%rdi), %xmm0
; AVX512BWVL-NEXT:    vpmovqb %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <16 x i8>, <16 x i8>* %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 2, i32 10>
  store <2 x i8> %strided.vec, <2 x i8>* %S
  ret void
}

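; Stride 8, offset 3: extract bytes 3 and 11 of a <16 x i8> load.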
define void @shuffle_v16i8_to_v2i8_3(<16 x i8>* %L, <2 x i8>* %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v2i8_3:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    movw %ax, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: shuffle_v16i8_to_v2i8_3:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    pextrw $0, %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: shuffle_v16i8_to_v2i8_3:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v16i8_to_v2i8_3:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v16i8_to_v2i8_3:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrld $24, (%rdi), %xmm0
; AVX512VL-NEXT:    vpmovqb %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v16i8_to_v2i8_3:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v16i8_to_v2i8_3:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrld $24, (%rdi), %xmm0
; AVX512BWVL-NEXT:    vpmovqb %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <16 x i8>, <16 x i8>* %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 3, i32 11>
  store <2 x i8> %strided.vec, <2 x i8>* %S
  ret void
}

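; Stride 8, offset 4: extract bytes 4 and 12 of a <16 x i8> load.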
define void @shuffle_v16i8_to_v2i8_4(<16 x i8>* %L, <2 x i8>* %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v2i8_4:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    movw %ax, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: shuffle_v16i8_to_v2i8_4:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    pextrw $0, %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: shuffle_v16i8_to_v2i8_4:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v16i8_to_v2i8_4:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v16i8_to_v2i8_4:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = mem[1,1,3,3]
; AVX512VL-NEXT:    vpmovqb %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v16i8_to_v2i8_4:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v16i8_to_v2i8_4:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm0 = mem[1,1,3,3]
; AVX512BWVL-NEXT:    vpmovqb %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <16 x i8>, <16 x i8>* %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 4, i32 12>
  store <2 x i8> %strided.vec, <2 x i8>* %S
  ret void
}

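; Stride 8, offset 5: extract bytes 5 and 13 of a <16 x i8> load.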
define void @shuffle_v16i8_to_v2i8_5(<16 x i8>* %L, <2 x i8>* %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v2i8_5:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    movw %ax, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: shuffle_v16i8_to_v2i8_5:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    pextrw $0, %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: shuffle_v16i8_to_v2i8_5:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v16i8_to_v2i8_5:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v16i8_to_v2i8_5:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrlq $40, (%rdi), %xmm0
; AVX512VL-NEXT:    vpmovqb %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v16i8_to_v2i8_5:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v16i8_to_v2i8_5:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrlq $40, (%rdi), %xmm0
; AVX512BWVL-NEXT:    vpmovqb %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <16 x i8>, <16 x i8>* %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 5, i32 13>
  store <2 x i8> %strided.vec, <2 x i8>* %S
  ret void
}

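; Stride 8, offset 6: extract bytes 6 and 14 of a <16 x i8> load.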
define void @shuffle_v16i8_to_v2i8_6(<16 x i8>* %L, <2 x i8>* %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v2i8_6:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    movw %ax, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: shuffle_v16i8_to_v2i8_6:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    pextrw $0, %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: shuffle_v16i8_to_v2i8_6:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v16i8_to_v2i8_6:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v16i8_to_v2i8_6:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrlq $48, (%rdi), %xmm0
; AVX512VL-NEXT:    vpmovqb %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v16i8_to_v2i8_6:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v16i8_to_v2i8_6:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrlq $48, (%rdi), %xmm0
; AVX512BWVL-NEXT:    vpmovqb %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <16 x i8>, <16 x i8>* %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 6, i32 14>
  store <2 x i8> %strided.vec, <2 x i8>* %S
  ret void
}

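; Stride 8, offset 7: extract bytes 7 and 15 of a <16 x i8> load.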
define void @shuffle_v16i8_to_v2i8_7(<16 x i8>* %L, <2 x i8>* %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v2i8_7:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    movw %ax, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: shuffle_v16i8_to_v2i8_7:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    pextrw $0, %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: shuffle_v16i8_to_v2i8_7:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v16i8_to_v2i8_7:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v16i8_to_v2i8_7:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrlq $56, (%rdi), %xmm0
; AVX512VL-NEXT:    vpmovqb %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v16i8_to_v2i8_7:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v16i8_to_v2i8_7:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrlq $56, (%rdi), %xmm0
; AVX512BWVL-NEXT:    vpmovqb %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <16 x i8>, <16 x i8>* %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 7, i32 15>
  store <2 x i8> %strided.vec, <2 x i8>* %S
  ret void
}