1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,SSE42
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2-SLOW
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2-FAST
7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2-FAST
8 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
9 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
10 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
11 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
12 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
13 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
14 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
; Strided extract with stride 2 and offset 1. Loads a <16 x i8> from %L, keeps
; the odd-indexed bytes (1, 3, ..., 15) and stores the resulting <8 x i8> to %S.
; Expected lowering below is a 16-bit right shift by 8 followed by an unsigned
; pack for the SSE2 run, and a single byte shuffle for every other run.
; The assertions are autogenerated (see the NOTE on the first line of the
; file), so regenerate them with the script instead of editing them by hand.
16 define void @shuffle_v16i8_to_v8i8_1(<16 x i8>* %L, <8 x i8>* %S) nounwind {
17 ; SSE2-LABEL: shuffle_v16i8_to_v8i8_1:
19 ; SSE2-NEXT: movdqa (%rdi), %xmm0
20 ; SSE2-NEXT: psrlw $8, %xmm0
21 ; SSE2-NEXT: packuswb %xmm0, %xmm0
22 ; SSE2-NEXT: movq %xmm0, (%rsi)
25 ; SSE42-LABEL: shuffle_v16i8_to_v8i8_1:
27 ; SSE42-NEXT: movdqa (%rdi), %xmm0
28 ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
29 ; SSE42-NEXT: movq %xmm0, (%rsi)
32 ; AVX-LABEL: shuffle_v16i8_to_v8i8_1:
34 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
35 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
36 ; AVX-NEXT: vmovq %xmm0, (%rsi)
39 ; AVX512-LABEL: shuffle_v16i8_to_v8i8_1:
41 ; AVX512-NEXT: vmovdqa (%rdi), %xmm0
42 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
43 ; AVX512-NEXT: vmovq %xmm0, (%rsi)
45 %vec = load <16 x i8>, <16 x i8>* %L
46 %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
47 store <8 x i8> %strided.vec, <8 x i8>* %S
; Strided extract with stride 2 and offset 1 on 16-bit elements. Loads an
; <8 x i16> from %L, keeps elements 1, 3, 5 and 7 and stores the resulting
; <4 x i16> to %S.
; Expected lowering below is a chain of word/dword shuffles for the SSE2 run
; and a single byte shuffle (byte pairs 2-3, 6-7, 10-11, 14-15) for the others.
; The assertions are autogenerated (see the NOTE on the first line of the
; file), so regenerate them with the script instead of editing them by hand.
51 define void @shuffle_v8i16_to_v4i16_1(<8 x i16>* %L, <4 x i16>* %S) nounwind {
52 ; SSE2-LABEL: shuffle_v8i16_to_v4i16_1:
54 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = mem[3,1,2,3,4,5,6,7]
55 ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7]
56 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
57 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
58 ; SSE2-NEXT: movq %xmm0, (%rsi)
61 ; SSE42-LABEL: shuffle_v8i16_to_v4i16_1:
63 ; SSE42-NEXT: movdqa (%rdi), %xmm0
64 ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
65 ; SSE42-NEXT: movq %xmm0, (%rsi)
68 ; AVX-LABEL: shuffle_v8i16_to_v4i16_1:
70 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
71 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
72 ; AVX-NEXT: vmovq %xmm0, (%rsi)
75 ; AVX512-LABEL: shuffle_v8i16_to_v4i16_1:
77 ; AVX512-NEXT: vmovdqa (%rdi), %xmm0
78 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
79 ; AVX512-NEXT: vmovq %xmm0, (%rsi)
81 %vec = load <8 x i16>, <8 x i16>* %L
82 %strided.vec = shufflevector <8 x i16> %vec, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
83 store <4 x i16> %strided.vec, <4 x i16>* %S
; Strided extract with stride 2 and offset 1 on 32-bit elements. Loads a
; <4 x i32> from %L, keeps elements 1 and 3 and stores the resulting
; <2 x i32> to %S.
; Expected lowering below folds the load into the shuffle for every run
; (integer dword shuffle for the SSE runs, vpermilps for the AVX and AVX512
; runs) and then stores the low 64 bits.
; The assertions are autogenerated (see the NOTE on the first line of the
; file), so regenerate them with the script instead of editing them by hand.
87 define void @shuffle_v4i32_to_v2i32_1(<4 x i32>* %L, <2 x i32>* %S) nounwind {
88 ; SSE-LABEL: shuffle_v4i32_to_v2i32_1:
90 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = mem[1,3,2,3]
91 ; SSE-NEXT: movq %xmm0, (%rsi)
94 ; AVX-LABEL: shuffle_v4i32_to_v2i32_1:
96 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,3,2,3]
97 ; AVX-NEXT: vmovlps %xmm0, (%rsi)
100 ; AVX512-LABEL: shuffle_v4i32_to_v2i32_1:
102 ; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,3,2,3]
103 ; AVX512-NEXT: vmovlps %xmm0, (%rsi)
105 %vec = load <4 x i32>, <4 x i32>* %L
106 %strided.vec = shufflevector <4 x i32> %vec, <4 x i32> undef, <2 x i32> <i32 1, i32 3>
107 store <2 x i32> %strided.vec, <2 x i32>* %S
; Strided extract with stride 4 and offset 1. Loads a <16 x i8> from %L, keeps
; bytes 1, 5, 9 and 13 and stores the resulting <4 x i8> to %S.
; Expected lowering below zero-extends both halves with byte unpacks against a
; zeroed register, shuffles and repacks for the SSE2 run, and uses a single
; byte shuffle followed by a 32-bit store for every other run.
; The assertions are autogenerated (see the NOTE on the first line of the
; file), so regenerate them with the script instead of editing them by hand.
111 define void @shuffle_v16i8_to_v4i8_1(<16 x i8>* %L, <4 x i8>* %S) nounwind {
112 ; SSE2-LABEL: shuffle_v16i8_to_v4i8_1:
114 ; SSE2-NEXT: movdqa (%rdi), %xmm0
115 ; SSE2-NEXT: pxor %xmm1, %xmm1
116 ; SSE2-NEXT: movdqa %xmm0, %xmm2
117 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
118 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
119 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
120 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
121 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
122 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
123 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
124 ; SSE2-NEXT: packuswb %xmm0, %xmm0
125 ; SSE2-NEXT: movd %xmm0, (%rsi)
128 ; SSE42-LABEL: shuffle_v16i8_to_v4i8_1:
130 ; SSE42-NEXT: movdqa (%rdi), %xmm0
131 ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u]
132 ; SSE42-NEXT: movd %xmm0, (%rsi)
135 ; AVX-LABEL: shuffle_v16i8_to_v4i8_1:
137 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
138 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u]
139 ; AVX-NEXT: vmovd %xmm0, (%rsi)
142 ; AVX512-LABEL: shuffle_v16i8_to_v4i8_1:
144 ; AVX512-NEXT: vmovdqa (%rdi), %xmm0
145 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u]
146 ; AVX512-NEXT: vmovd %xmm0, (%rsi)
148 %vec = load <16 x i8>, <16 x i8>* %L
149 %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
150 store <4 x i8> %strided.vec, <4 x i8>* %S
; Strided extract with stride 4 and offset 2. Loads a <16 x i8> from %L, keeps
; bytes 2, 6, 10 and 14 and stores the resulting <4 x i8> to %S.
; Expected lowering below masks the input with a constant-pool operand, then
; shuffles words and packs for the SSE2 run, and uses a single byte shuffle
; followed by a 32-bit store for every other run.
; The assertions are autogenerated (see the NOTE on the first line of the
; file), so regenerate them with the script instead of editing them by hand.
154 define void @shuffle_v16i8_to_v4i8_2(<16 x i8>* %L, <4 x i8>* %S) nounwind {
155 ; SSE2-LABEL: shuffle_v16i8_to_v4i8_2:
157 ; SSE2-NEXT: movdqa (%rdi), %xmm0
158 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
159 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
160 ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7]
161 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
162 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
163 ; SSE2-NEXT: packuswb %xmm0, %xmm0
164 ; SSE2-NEXT: movd %xmm0, (%rsi)
167 ; SSE42-LABEL: shuffle_v16i8_to_v4i8_2:
169 ; SSE42-NEXT: movdqa (%rdi), %xmm0
170 ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u]
171 ; SSE42-NEXT: movd %xmm0, (%rsi)
174 ; AVX-LABEL: shuffle_v16i8_to_v4i8_2:
176 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
177 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u]
178 ; AVX-NEXT: vmovd %xmm0, (%rsi)
181 ; AVX512-LABEL: shuffle_v16i8_to_v4i8_2:
183 ; AVX512-NEXT: vmovdqa (%rdi), %xmm0
184 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u]
185 ; AVX512-NEXT: vmovd %xmm0, (%rsi)
187 %vec = load <16 x i8>, <16 x i8>* %L
188 %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
189 store <4 x i8> %strided.vec, <4 x i8>* %S
; Strided extract with stride 4 and offset 3. Loads a <16 x i8> from %L, keeps
; bytes 3, 7, 11 and 15 and stores the resulting <4 x i8> to %S.
; Expected lowering below zero-extends both halves with byte unpacks against a
; zeroed register, shuffles and repacks for the SSE2 run, and uses a single
; byte shuffle followed by a 32-bit store for every other run.
; The assertions are autogenerated (see the NOTE on the first line of the
; file), so regenerate them with the script instead of editing them by hand.
193 define void @shuffle_v16i8_to_v4i8_3(<16 x i8>* %L, <4 x i8>* %S) nounwind {
194 ; SSE2-LABEL: shuffle_v16i8_to_v4i8_3:
196 ; SSE2-NEXT: movdqa (%rdi), %xmm0
197 ; SSE2-NEXT: pxor %xmm1, %xmm1
198 ; SSE2-NEXT: movdqa %xmm0, %xmm2
199 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
200 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
201 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
202 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
203 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
204 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
205 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
206 ; SSE2-NEXT: packuswb %xmm0, %xmm0
207 ; SSE2-NEXT: movd %xmm0, (%rsi)
210 ; SSE42-LABEL: shuffle_v16i8_to_v4i8_3:
212 ; SSE42-NEXT: movdqa (%rdi), %xmm0
213 ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u]
214 ; SSE42-NEXT: movd %xmm0, (%rsi)
217 ; AVX-LABEL: shuffle_v16i8_to_v4i8_3:
219 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
220 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u]
221 ; AVX-NEXT: vmovd %xmm0, (%rsi)
224 ; AVX512-LABEL: shuffle_v16i8_to_v4i8_3:
226 ; AVX512-NEXT: vmovdqa (%rdi), %xmm0
227 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u]
228 ; AVX512-NEXT: vmovd %xmm0, (%rsi)
230 %vec = load <16 x i8>, <16 x i8>* %L
231 %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
232 store <4 x i8> %strided.vec, <4 x i8>* %S
; Strided extract with stride 4 and offset 1 on 16-bit elements. Loads an
; <8 x i16> from %L, keeps elements 1 and 5 and stores the resulting
; <2 x i16> to %S.
; Expected lowering below differs by run. The SSE, AVX1, AVX2-SLOW and AVX512F
; runs use a dword shuffle folded with the load plus a low-word shuffle. The
; AVX2-FAST, AVX512VL, AVX512BW and AVX512BWVL runs, whose RUN lines all pass
; +fast-variable-perlane-shuffle, use a load plus a single byte shuffle.
; The assertions are autogenerated (see the NOTE on the first line of the
; file), so regenerate them with the script instead of editing them by hand.
236 define void @shuffle_v8i16_to_v2i16_1(<8 x i16>* %L, <2 x i16>* %S) nounwind {
237 ; SSE-LABEL: shuffle_v8i16_to_v2i16_1:
239 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = mem[0,2,2,3]
240 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
241 ; SSE-NEXT: movd %xmm0, (%rsi)
244 ; AVX1-LABEL: shuffle_v8i16_to_v2i16_1:
246 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
247 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
248 ; AVX1-NEXT: vmovd %xmm0, (%rsi)
251 ; AVX2-SLOW-LABEL: shuffle_v8i16_to_v2i16_1:
252 ; AVX2-SLOW: # %bb.0:
253 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
254 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
255 ; AVX2-SLOW-NEXT: vmovd %xmm0, (%rsi)
256 ; AVX2-SLOW-NEXT: retq
258 ; AVX2-FAST-LABEL: shuffle_v8i16_to_v2i16_1:
259 ; AVX2-FAST: # %bb.0:
260 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0
261 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
262 ; AVX2-FAST-NEXT: vmovd %xmm0, (%rsi)
263 ; AVX2-FAST-NEXT: retq
265 ; AVX512F-LABEL: shuffle_v8i16_to_v2i16_1:
267 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
268 ; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
269 ; AVX512F-NEXT: vmovd %xmm0, (%rsi)
272 ; AVX512VL-LABEL: shuffle_v8i16_to_v2i16_1:
274 ; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
275 ; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
276 ; AVX512VL-NEXT: vmovd %xmm0, (%rsi)
277 ; AVX512VL-NEXT: retq
279 ; AVX512BW-LABEL: shuffle_v8i16_to_v2i16_1:
281 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
282 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
283 ; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
284 ; AVX512BW-NEXT: retq
286 ; AVX512BWVL-LABEL: shuffle_v8i16_to_v2i16_1:
287 ; AVX512BWVL: # %bb.0:
288 ; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
289 ; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
290 ; AVX512BWVL-NEXT: vmovd %xmm0, (%rsi)
291 ; AVX512BWVL-NEXT: retq
292 %vec = load <8 x i16>, <8 x i16>* %L
293 %strided.vec = shufflevector <8 x i16> %vec, <8 x i16> undef, <2 x i32> <i32 1, i32 5>
294 store <2 x i16> %strided.vec, <2 x i16>* %S
; Strided extract with stride 4 and offset 2 on 16-bit elements. Loads an
; <8 x i16> from %L, keeps elements 2 and 6 and stores the resulting
; <2 x i16> to %S.
; Expected lowering below differs by run. The SSE, AVX1, AVX2-SLOW and AVX512F
; runs use a dword shuffle folded with the load plus a low-word shuffle. The
; AVX2-FAST, AVX512VL, AVX512BW and AVX512BWVL runs, whose RUN lines all pass
; +fast-variable-perlane-shuffle, use a load plus a single byte shuffle.
; The assertions are autogenerated (see the NOTE on the first line of the
; file), so regenerate them with the script instead of editing them by hand.
298 define void @shuffle_v8i16_to_v2i16_2(<8 x i16>* %L, <2 x i16>* %S) nounwind {
299 ; SSE-LABEL: shuffle_v8i16_to_v2i16_2:
301 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = mem[3,1,2,3]
302 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
303 ; SSE-NEXT: movd %xmm0, (%rsi)
306 ; AVX1-LABEL: shuffle_v8i16_to_v2i16_2:
308 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
309 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
310 ; AVX1-NEXT: vmovd %xmm0, (%rsi)
313 ; AVX2-SLOW-LABEL: shuffle_v8i16_to_v2i16_2:
314 ; AVX2-SLOW: # %bb.0:
315 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
316 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
317 ; AVX2-SLOW-NEXT: vmovd %xmm0, (%rsi)
318 ; AVX2-SLOW-NEXT: retq
320 ; AVX2-FAST-LABEL: shuffle_v8i16_to_v2i16_2:
321 ; AVX2-FAST: # %bb.0:
322 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0
323 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u]
324 ; AVX2-FAST-NEXT: vmovd %xmm0, (%rsi)
325 ; AVX2-FAST-NEXT: retq
327 ; AVX512F-LABEL: shuffle_v8i16_to_v2i16_2:
329 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
330 ; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
331 ; AVX512F-NEXT: vmovd %xmm0, (%rsi)
334 ; AVX512VL-LABEL: shuffle_v8i16_to_v2i16_2:
336 ; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
337 ; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u]
338 ; AVX512VL-NEXT: vmovd %xmm0, (%rsi)
339 ; AVX512VL-NEXT: retq
341 ; AVX512BW-LABEL: shuffle_v8i16_to_v2i16_2:
343 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
344 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u]
345 ; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
346 ; AVX512BW-NEXT: retq
348 ; AVX512BWVL-LABEL: shuffle_v8i16_to_v2i16_2:
349 ; AVX512BWVL: # %bb.0:
350 ; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
351 ; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u]
352 ; AVX512BWVL-NEXT: vmovd %xmm0, (%rsi)
353 ; AVX512BWVL-NEXT: retq
354 %vec = load <8 x i16>, <8 x i16>* %L
355 %strided.vec = shufflevector <8 x i16> %vec, <8 x i16> undef, <2 x i32> <i32 2, i32 6>
356 store <2 x i16> %strided.vec, <2 x i16>* %S
; Strided extract with stride 4 and offset 3 on 16-bit elements. Loads an
; <8 x i16> from %L, keeps elements 3 and 7 and stores the resulting
; <2 x i16> to %S.
; Expected lowering below differs by run. The SSE, AVX1, AVX2-SLOW and AVX512F
; runs use a dword shuffle folded with the load plus a low-word shuffle. The
; AVX2-FAST, AVX512VL, AVX512BW and AVX512BWVL runs, whose RUN lines all pass
; +fast-variable-perlane-shuffle, use a load plus a single byte shuffle.
; The assertions are autogenerated (see the NOTE on the first line of the
; file), so regenerate them with the script instead of editing them by hand.
360 define void @shuffle_v8i16_to_v2i16_3(<8 x i16>* %L, <2 x i16>* %S) nounwind {
361 ; SSE-LABEL: shuffle_v8i16_to_v2i16_3:
363 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = mem[3,1,2,3]
364 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
365 ; SSE-NEXT: movd %xmm0, (%rsi)
368 ; AVX1-LABEL: shuffle_v8i16_to_v2i16_3:
370 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
371 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
372 ; AVX1-NEXT: vmovd %xmm0, (%rsi)
375 ; AVX2-SLOW-LABEL: shuffle_v8i16_to_v2i16_3:
376 ; AVX2-SLOW: # %bb.0:
377 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
378 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
379 ; AVX2-SLOW-NEXT: vmovd %xmm0, (%rsi)
380 ; AVX2-SLOW-NEXT: retq
382 ; AVX2-FAST-LABEL: shuffle_v8i16_to_v2i16_3:
383 ; AVX2-FAST: # %bb.0:
384 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0
385 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,u,u,u,u,u,u,u,u,u,u,u,u]
386 ; AVX2-FAST-NEXT: vmovd %xmm0, (%rsi)
387 ; AVX2-FAST-NEXT: retq
389 ; AVX512F-LABEL: shuffle_v8i16_to_v2i16_3:
391 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
392 ; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
393 ; AVX512F-NEXT: vmovd %xmm0, (%rsi)
396 ; AVX512VL-LABEL: shuffle_v8i16_to_v2i16_3:
398 ; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
399 ; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,u,u,u,u,u,u,u,u,u,u,u,u]
400 ; AVX512VL-NEXT: vmovd %xmm0, (%rsi)
401 ; AVX512VL-NEXT: retq
403 ; AVX512BW-LABEL: shuffle_v8i16_to_v2i16_3:
405 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
406 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,u,u,u,u,u,u,u,u,u,u,u,u]
407 ; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
408 ; AVX512BW-NEXT: retq
410 ; AVX512BWVL-LABEL: shuffle_v8i16_to_v2i16_3:
411 ; AVX512BWVL: # %bb.0:
412 ; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
413 ; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,u,u,u,u,u,u,u,u,u,u,u,u]
414 ; AVX512BWVL-NEXT: vmovd %xmm0, (%rsi)
415 ; AVX512BWVL-NEXT: retq
416 %vec = load <8 x i16>, <8 x i16>* %L
417 %strided.vec = shufflevector <8 x i16> %vec, <8 x i16> undef, <2 x i32> <i32 3, i32 7>
418 store <2 x i16> %strided.vec, <2 x i16>* %S
; Strided extract with stride 8 and offset 1. Loads a <16 x i8> from %L, keeps
; bytes 1 and 9 and stores the resulting <2 x i8> to %S.
; Expected lowering below unpacks against a zeroed register, broadcasts one
; dword, packs and stores 16 bits through %eax for the SSE2 run. Every other
; run uses a single byte shuffle and a 16-bit extract straight to memory.
; The assertions are autogenerated (see the NOTE on the first line of the
; file), so regenerate them with the script instead of editing them by hand.
422 define void @shuffle_v16i8_to_v2i8_1(<16 x i8>* %L, <2 x i8>* %S) nounwind {
423 ; SSE2-LABEL: shuffle_v16i8_to_v2i8_1:
425 ; SSE2-NEXT: movdqa (%rdi), %xmm0
426 ; SSE2-NEXT: pxor %xmm1, %xmm1
427 ; SSE2-NEXT: movdqa %xmm0, %xmm2
428 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
429 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
430 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
431 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
432 ; SSE2-NEXT: packuswb %xmm0, %xmm0
433 ; SSE2-NEXT: movd %xmm0, %eax
434 ; SSE2-NEXT: movw %ax, (%rsi)
437 ; SSE42-LABEL: shuffle_v16i8_to_v2i8_1:
439 ; SSE42-NEXT: movdqa (%rdi), %xmm0
440 ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
441 ; SSE42-NEXT: pextrw $0, %xmm0, (%rsi)
444 ; AVX-LABEL: shuffle_v16i8_to_v2i8_1:
446 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
447 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
448 ; AVX-NEXT: vpextrw $0, %xmm0, (%rsi)
451 ; AVX512-LABEL: shuffle_v16i8_to_v2i8_1:
453 ; AVX512-NEXT: vmovdqa (%rdi), %xmm0
454 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
455 ; AVX512-NEXT: vpextrw $0, %xmm0, (%rsi)
457 %vec = load <16 x i8>, <16 x i8>* %L
458 %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 1, i32 9>
459 store <2 x i8> %strided.vec, <2 x i8>* %S
; Strided extract with stride 8 and offset 2. Loads a <16 x i8> from %L, keeps
; bytes 2 and 10 and stores the resulting <2 x i8> to %S.
; Expected lowering below masks the input with a constant-pool operand, then
; shuffles, packs and stores 16 bits through %eax for the SSE2 run. Every other
; run uses a single byte shuffle and a 16-bit extract straight to memory.
; The assertions are autogenerated (see the NOTE on the first line of the
; file), so regenerate them with the script instead of editing them by hand.
463 define void @shuffle_v16i8_to_v2i8_2(<16 x i8>* %L, <2 x i8>* %S) nounwind {
464 ; SSE2-LABEL: shuffle_v16i8_to_v2i8_2:
466 ; SSE2-NEXT: movdqa (%rdi), %xmm0
467 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
468 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
469 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
470 ; SSE2-NEXT: packuswb %xmm0, %xmm0
471 ; SSE2-NEXT: movd %xmm0, %eax
472 ; SSE2-NEXT: movw %ax, (%rsi)
475 ; SSE42-LABEL: shuffle_v16i8_to_v2i8_2:
477 ; SSE42-NEXT: movdqa (%rdi), %xmm0
478 ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
479 ; SSE42-NEXT: pextrw $0, %xmm0, (%rsi)
482 ; AVX-LABEL: shuffle_v16i8_to_v2i8_2:
484 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
485 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
486 ; AVX-NEXT: vpextrw $0, %xmm0, (%rsi)
489 ; AVX512-LABEL: shuffle_v16i8_to_v2i8_2:
491 ; AVX512-NEXT: vmovdqa (%rdi), %xmm0
492 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
493 ; AVX512-NEXT: vpextrw $0, %xmm0, (%rsi)
495 %vec = load <16 x i8>, <16 x i8>* %L
496 %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 2, i32 10>
497 store <2 x i8> %strided.vec, <2 x i8>* %S
; Strided extract with stride 8 and offset 3. Loads a <16 x i8> from %L, keeps
; bytes 3 and 11 and stores the resulting <2 x i8> to %S.
; Expected lowering below unpacks against a zeroed register, broadcasts one
; dword, packs and stores 16 bits through %eax for the SSE2 run. Every other
; run uses a single byte shuffle and a 16-bit extract straight to memory.
; The assertions are autogenerated (see the NOTE on the first line of the
; file), so regenerate them with the script instead of editing them by hand.
501 define void @shuffle_v16i8_to_v2i8_3(<16 x i8>* %L, <2 x i8>* %S) nounwind {
502 ; SSE2-LABEL: shuffle_v16i8_to_v2i8_3:
504 ; SSE2-NEXT: movdqa (%rdi), %xmm0
505 ; SSE2-NEXT: pxor %xmm1, %xmm1
506 ; SSE2-NEXT: movdqa %xmm0, %xmm2
507 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
508 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
509 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
510 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
511 ; SSE2-NEXT: packuswb %xmm0, %xmm0
512 ; SSE2-NEXT: movd %xmm0, %eax
513 ; SSE2-NEXT: movw %ax, (%rsi)
516 ; SSE42-LABEL: shuffle_v16i8_to_v2i8_3:
518 ; SSE42-NEXT: movdqa (%rdi), %xmm0
519 ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
520 ; SSE42-NEXT: pextrw $0, %xmm0, (%rsi)
523 ; AVX-LABEL: shuffle_v16i8_to_v2i8_3:
525 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
526 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
527 ; AVX-NEXT: vpextrw $0, %xmm0, (%rsi)
530 ; AVX512-LABEL: shuffle_v16i8_to_v2i8_3:
532 ; AVX512-NEXT: vmovdqa (%rdi), %xmm0
533 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
534 ; AVX512-NEXT: vpextrw $0, %xmm0, (%rsi)
536 %vec = load <16 x i8>, <16 x i8>* %L
537 %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 3, i32 11>
538 store <2 x i8> %strided.vec, <2 x i8>* %S
; Strided extract with stride 8 and offset 4. Loads a <16 x i8> from %L, keeps
; bytes 4 and 12 and stores the resulting <2 x i8> to %S.
; Expected lowering below masks the input with a constant-pool operand, then
; shuffles, packs and stores 16 bits through %eax for the SSE2 run. Every other
; run uses a single byte shuffle and a 16-bit extract straight to memory.
; The assertions are autogenerated (see the NOTE on the first line of the
; file), so regenerate them with the script instead of editing them by hand.
542 define void @shuffle_v16i8_to_v2i8_4(<16 x i8>* %L, <2 x i8>* %S) nounwind {
543 ; SSE2-LABEL: shuffle_v16i8_to_v2i8_4:
545 ; SSE2-NEXT: movdqa (%rdi), %xmm0
546 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
547 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
548 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
549 ; SSE2-NEXT: packuswb %xmm0, %xmm0
550 ; SSE2-NEXT: movd %xmm0, %eax
551 ; SSE2-NEXT: movw %ax, (%rsi)
554 ; SSE42-LABEL: shuffle_v16i8_to_v2i8_4:
556 ; SSE42-NEXT: movdqa (%rdi), %xmm0
557 ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
558 ; SSE42-NEXT: pextrw $0, %xmm0, (%rsi)
561 ; AVX-LABEL: shuffle_v16i8_to_v2i8_4:
563 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
564 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
565 ; AVX-NEXT: vpextrw $0, %xmm0, (%rsi)
568 ; AVX512-LABEL: shuffle_v16i8_to_v2i8_4:
570 ; AVX512-NEXT: vmovdqa (%rdi), %xmm0
571 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
572 ; AVX512-NEXT: vpextrw $0, %xmm0, (%rsi)
574 %vec = load <16 x i8>, <16 x i8>* %L
575 %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 4, i32 12>
576 store <2 x i8> %strided.vec, <2 x i8>* %S
; Strided extract with stride 8 and offset 5. Loads a <16 x i8> from %L, keeps
; bytes 5 and 13 and stores the resulting <2 x i8> to %S.
; Expected lowering below unpacks against a zeroed register, interleaves the
; high words, broadcasts one dword, packs and stores 16 bits through %eax for
; the SSE2 run. Every other run uses a single byte shuffle and a 16-bit
; extract straight to memory.
; The assertions are autogenerated (see the NOTE on the first line of the
; file), so regenerate them with the script instead of editing them by hand.
580 define void @shuffle_v16i8_to_v2i8_5(<16 x i8>* %L, <2 x i8>* %S) nounwind {
581 ; SSE2-LABEL: shuffle_v16i8_to_v2i8_5:
583 ; SSE2-NEXT: movdqa (%rdi), %xmm0
584 ; SSE2-NEXT: pxor %xmm1, %xmm1
585 ; SSE2-NEXT: movdqa %xmm0, %xmm2
586 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
587 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
588 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
589 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
590 ; SSE2-NEXT: packuswb %xmm0, %xmm0
591 ; SSE2-NEXT: movd %xmm0, %eax
592 ; SSE2-NEXT: movw %ax, (%rsi)
595 ; SSE42-LABEL: shuffle_v16i8_to_v2i8_5:
597 ; SSE42-NEXT: movdqa (%rdi), %xmm0
598 ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
599 ; SSE42-NEXT: pextrw $0, %xmm0, (%rsi)
602 ; AVX-LABEL: shuffle_v16i8_to_v2i8_5:
604 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
605 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
606 ; AVX-NEXT: vpextrw $0, %xmm0, (%rsi)
609 ; AVX512-LABEL: shuffle_v16i8_to_v2i8_5:
611 ; AVX512-NEXT: vmovdqa (%rdi), %xmm0
612 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
613 ; AVX512-NEXT: vpextrw $0, %xmm0, (%rsi)
615 %vec = load <16 x i8>, <16 x i8>* %L
616 %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 5, i32 13>
617 store <2 x i8> %strided.vec, <2 x i8>* %S
; Strided extract with stride 8 and offset 6. Loads a <16 x i8> from %L, keeps
; bytes 6 and 14 and stores the resulting <2 x i8> to %S.
; Expected lowering below masks the input with a constant-pool operand, then
; shuffles, packs and stores 16 bits through %eax for the SSE2 run. Every other
; run uses a single byte shuffle and a 16-bit extract straight to memory.
; The assertions are autogenerated (see the NOTE on the first line of the
; file), so regenerate them with the script instead of editing them by hand.
621 define void @shuffle_v16i8_to_v2i8_6(<16 x i8>* %L, <2 x i8>* %S) nounwind {
622 ; SSE2-LABEL: shuffle_v16i8_to_v2i8_6:
624 ; SSE2-NEXT: movdqa (%rdi), %xmm0
625 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
626 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
627 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
628 ; SSE2-NEXT: packuswb %xmm0, %xmm0
629 ; SSE2-NEXT: movd %xmm0, %eax
630 ; SSE2-NEXT: movw %ax, (%rsi)
633 ; SSE42-LABEL: shuffle_v16i8_to_v2i8_6:
635 ; SSE42-NEXT: movdqa (%rdi), %xmm0
636 ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
637 ; SSE42-NEXT: pextrw $0, %xmm0, (%rsi)
640 ; AVX-LABEL: shuffle_v16i8_to_v2i8_6:
642 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
643 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
644 ; AVX-NEXT: vpextrw $0, %xmm0, (%rsi)
647 ; AVX512-LABEL: shuffle_v16i8_to_v2i8_6:
649 ; AVX512-NEXT: vmovdqa (%rdi), %xmm0
650 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
651 ; AVX512-NEXT: vpextrw $0, %xmm0, (%rsi)
653 %vec = load <16 x i8>, <16 x i8>* %L
654 %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 6, i32 14>
655 store <2 x i8> %strided.vec, <2 x i8>* %S
; Strided extract with stride 8 and offset 7. Loads a <16 x i8> from %L, keeps
; bytes 7 and 15 and stores the resulting <2 x i8> to %S.
; Expected lowering below combines a low byte unpack against a zeroed
; register with a 16-bit right shift by 8, interleaves the high words,
; broadcasts one dword, packs and stores 16 bits through %eax for the SSE2
; run. Every other run uses a single byte shuffle and a 16-bit extract
; straight to memory.
; The assertions are autogenerated (see the NOTE on the first line of the
; file), so regenerate them with the script instead of editing them by hand.
659 define void @shuffle_v16i8_to_v2i8_7(<16 x i8>* %L, <2 x i8>* %S) nounwind {
660 ; SSE2-LABEL: shuffle_v16i8_to_v2i8_7:
662 ; SSE2-NEXT: movdqa (%rdi), %xmm0
663 ; SSE2-NEXT: pxor %xmm1, %xmm1
664 ; SSE2-NEXT: movdqa %xmm0, %xmm2
665 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
666 ; SSE2-NEXT: psrlw $8, %xmm0
667 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
668 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,3,3,3]
669 ; SSE2-NEXT: packuswb %xmm0, %xmm0
670 ; SSE2-NEXT: movd %xmm0, %eax
671 ; SSE2-NEXT: movw %ax, (%rsi)
674 ; SSE42-LABEL: shuffle_v16i8_to_v2i8_7:
676 ; SSE42-NEXT: movdqa (%rdi), %xmm0
677 ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
678 ; SSE42-NEXT: pextrw $0, %xmm0, (%rsi)
681 ; AVX-LABEL: shuffle_v16i8_to_v2i8_7:
683 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
684 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
685 ; AVX-NEXT: vpextrw $0, %xmm0, (%rsi)
688 ; AVX512-LABEL: shuffle_v16i8_to_v2i8_7:
690 ; AVX512-NEXT: vmovdqa (%rdi), %xmm0
691 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
692 ; AVX512-NEXT: vpextrw $0, %xmm0, (%rsi)
694 %vec = load <16 x i8>, <16 x i8>* %L
695 %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 7, i32 15>
696 store <2 x i8> %strided.vec, <2 x i8>* %S