; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,SSE42
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
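
; These tests load a 128-bit vector, extract a strided subset of its elements
; via shufflevector, and store the narrowed result. Runs with SSSE3's pshufb
; available (SSE42, AVX, AVX512) are expected to lower most of these to a
; single byte shuffle; plain SSE2 falls back to shift, pack, and unpack
; sequences.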

define void @shuffle_v16i8_to_v8i8_1(ptr %L, ptr %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v8i8_1:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: psrlw $8, %xmm0
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: movq %xmm0, (%rsi)
; SSE2-NEXT: retq
;
; SSE42-LABEL: shuffle_v16i8_to_v8i8_1:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; SSE42-NEXT: movq %xmm0, (%rsi)
; SSE42-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_to_v8i8_1:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX-NEXT: vmovq %xmm0, (%rsi)
; AVX-NEXT: retq
;
; AVX512-LABEL: shuffle_v16i8_to_v8i8_1:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vmovq %xmm0, (%rsi)
; AVX512-NEXT: retq
  %vec = load <16 x i8>, ptr %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  store <8 x i8> %strided.vec, ptr %S
  ret void
}

define void @shuffle_v8i16_to_v4i16_1(ptr %L, ptr %S) nounwind {
; SSE2-LABEL: shuffle_v8i16_to_v4i16_1:
; SSE2: # %bb.0:
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = mem[3,1,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
; SSE2-NEXT: movq %xmm0, (%rsi)
; SSE2-NEXT: retq
;
; SSE42-LABEL: shuffle_v8i16_to_v4i16_1:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
; SSE42-NEXT: movq %xmm0, (%rsi)
; SSE42-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_to_v4i16_1:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
; AVX-NEXT: vmovq %xmm0, (%rsi)
; AVX-NEXT: retq
;
; AVX512-LABEL: shuffle_v8i16_to_v4i16_1:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vmovq %xmm0, (%rsi)
; AVX512-NEXT: retq
  %vec = load <8 x i16>, ptr %L
  %strided.vec = shufflevector <8 x i16> %vec, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  store <4 x i16> %strided.vec, ptr %S
  ret void
}

define void @shuffle_v4i32_to_v2i32_1(ptr %L, ptr %S) nounwind {
; SSE-LABEL: shuffle_v4i32_to_v2i32_1:
; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = mem[1,3,2,3]
; SSE-NEXT: movq %xmm0, (%rsi)
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v4i32_to_v2i32_1:
; AVX: # %bb.0:
; AVX-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,3,2,3]
; AVX-NEXT: vmovlps %xmm0, (%rsi)
; AVX-NEXT: retq
;
; AVX512-LABEL: shuffle_v4i32_to_v2i32_1:
; AVX512: # %bb.0:
; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,3,2,3]
; AVX512-NEXT: vmovlps %xmm0, (%rsi)
; AVX512-NEXT: retq
  %vec = load <4 x i32>, ptr %L
  %strided.vec = shufflevector <4 x i32> %vec, <4 x i32> undef, <2 x i32> <i32 1, i32 3>
  store <2 x i32> %strided.vec, ptr %S
  ret void
}
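
; Extract every fourth byte of a v16i8, at offsets 1 through 3.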

define void @shuffle_v16i8_to_v4i8_1(ptr %L, ptr %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v4i8_1:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: movd %xmm0, (%rsi)
; SSE2-NEXT: retq
;
; SSE42-LABEL: shuffle_v16i8_to_v4i8_1:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT: movd %xmm0, (%rsi)
; SSE42-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_to_v4i8_1:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT: vmovd %xmm0, (%rsi)
; AVX-NEXT: retq
;
; AVX512-LABEL: shuffle_v16i8_to_v4i8_1:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vmovd %xmm0, (%rsi)
; AVX512-NEXT: retq
  %vec = load <16 x i8>, ptr %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
  store <4 x i8> %strided.vec, ptr %S
  ret void
}

define void @shuffle_v16i8_to_v4i8_2(ptr %L, ptr %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v4i8_2:
; SSE2: # %bb.0:
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = mem[3,1,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7]
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: movd %xmm0, (%rsi)
; SSE2-NEXT: retq
;
; SSE42-LABEL: shuffle_v16i8_to_v4i8_2:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT: movd %xmm0, (%rsi)
; SSE42-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_to_v4i8_2:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT: vmovd %xmm0, (%rsi)
; AVX-NEXT: retq
;
; AVX512-LABEL: shuffle_v16i8_to_v4i8_2:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vmovd %xmm0, (%rsi)
; AVX512-NEXT: retq
  %vec = load <16 x i8>, ptr %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
  store <4 x i8> %strided.vec, ptr %S
  ret void
}

define void @shuffle_v16i8_to_v4i8_3(ptr %L, ptr %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v4i8_3:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: movd %xmm0, (%rsi)
; SSE2-NEXT: retq
;
; SSE42-LABEL: shuffle_v16i8_to_v4i8_3:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT: movd %xmm0, (%rsi)
; SSE42-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_to_v4i8_3:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT: vmovd %xmm0, (%rsi)
; AVX-NEXT: retq
;
; AVX512-LABEL: shuffle_v16i8_to_v4i8_3:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vmovd %xmm0, (%rsi)
; AVX512-NEXT: retq
  %vec = load <16 x i8>, ptr %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
  store <4 x i8> %strided.vec, ptr %S
  ret void
}
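
; Extract two i16 elements at stride 4, at offsets 1 through 3.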

define void @shuffle_v8i16_to_v2i16_1(ptr %L, ptr %S) nounwind {
; SSE-LABEL: shuffle_v8i16_to_v2i16_1:
; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; SSE-NEXT: movd %xmm0, (%rsi)
; SSE-NEXT: retq
;
; AVX1-LABEL: shuffle_v8i16_to_v2i16_1:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; AVX1-NEXT: vmovd %xmm0, (%rsi)
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: shuffle_v8i16_to_v2i16_1:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vmovd %xmm0, (%rsi)
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: shuffle_v8i16_to_v2i16_1:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-FAST-NEXT: vmovd %xmm0, (%rsi)
; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: shuffle_v8i16_to_v2i16_1:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; AVX512F-NEXT: vmovd %xmm0, (%rsi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v8i16_to_v2i16_1:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512VL-NEXT: vmovd %xmm0, (%rsi)
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v8i16_to_v2i16_1:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v8i16_to_v2i16_1:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BWVL-NEXT: vmovd %xmm0, (%rsi)
; AVX512BWVL-NEXT: retq
  %vec = load <8 x i16>, ptr %L
  %strided.vec = shufflevector <8 x i16> %vec, <8 x i16> undef, <2 x i32> <i32 1, i32 5>
  store <2 x i16> %strided.vec, ptr %S
  ret void
}

define void @shuffle_v8i16_to_v2i16_2(ptr %L, ptr %S) nounwind {
; SSE-LABEL: shuffle_v8i16_to_v2i16_2:
; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; SSE-NEXT: movd %xmm0, (%rsi)
; SSE-NEXT: retq
;
; AVX1-LABEL: shuffle_v8i16_to_v2i16_2:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; AVX1-NEXT: vmovd %xmm0, (%rsi)
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: shuffle_v8i16_to_v2i16_2:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vmovd %xmm0, (%rsi)
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: shuffle_v8i16_to_v2i16_2:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-FAST-NEXT: vmovd %xmm0, (%rsi)
; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: shuffle_v8i16_to_v2i16_2:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; AVX512F-NEXT: vmovd %xmm0, (%rsi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v8i16_to_v2i16_2:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512VL-NEXT: vmovd %xmm0, (%rsi)
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v8i16_to_v2i16_2:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v8i16_to_v2i16_2:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BWVL-NEXT: vmovd %xmm0, (%rsi)
; AVX512BWVL-NEXT: retq
  %vec = load <8 x i16>, ptr %L
  %strided.vec = shufflevector <8 x i16> %vec, <8 x i16> undef, <2 x i32> <i32 2, i32 6>
  store <2 x i16> %strided.vec, ptr %S
  ret void
}

define void @shuffle_v8i16_to_v2i16_3(ptr %L, ptr %S) nounwind {
; SSE-LABEL: shuffle_v8i16_to_v2i16_3:
; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; SSE-NEXT: movd %xmm0, (%rsi)
; SSE-NEXT: retq
;
; AVX1-LABEL: shuffle_v8i16_to_v2i16_3:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; AVX1-NEXT: vmovd %xmm0, (%rsi)
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: shuffle_v8i16_to_v2i16_3:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vmovd %xmm0, (%rsi)
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: shuffle_v8i16_to_v2i16_3:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-FAST-NEXT: vmovd %xmm0, (%rsi)
; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: shuffle_v8i16_to_v2i16_3:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; AVX512F-NEXT: vmovd %xmm0, (%rsi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v8i16_to_v2i16_3:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512VL-NEXT: vmovd %xmm0, (%rsi)
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v8i16_to_v2i16_3:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v8i16_to_v2i16_3:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BWVL-NEXT: vmovd %xmm0, (%rsi)
; AVX512BWVL-NEXT: retq
  %vec = load <8 x i16>, ptr %L
  %strided.vec = shufflevector <8 x i16> %vec, <8 x i16> undef, <2 x i32> <i32 3, i32 7>
  store <2 x i16> %strided.vec, ptr %S
  ret void
}
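
; Extract two i8 elements at stride 8, at offsets 1 through 7.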

define void @shuffle_v16i8_to_v2i8_1(ptr %L, ptr %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v2i8_1:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: movw %ax, (%rsi)
; SSE2-NEXT: retq
;
; SSE42-LABEL: shuffle_v16i8_to_v2i8_1:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT: pextrw $0, %xmm0, (%rsi)
; SSE42-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_to_v2i8_1:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT: vpextrw $0, %xmm0, (%rsi)
; AVX-NEXT: retq
;
; AVX512-LABEL: shuffle_v16i8_to_v2i8_1:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vpextrw $0, %xmm0, (%rsi)
; AVX512-NEXT: retq
  %vec = load <16 x i8>, ptr %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 1, i32 9>
  store <2 x i8> %strided.vec, ptr %S
  ret void
}

define void @shuffle_v16i8_to_v2i8_2(ptr %L, ptr %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v2i8_2:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: movw %ax, (%rsi)
; SSE2-NEXT: retq
;
; SSE42-LABEL: shuffle_v16i8_to_v2i8_2:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT: pextrw $0, %xmm0, (%rsi)
; SSE42-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_to_v2i8_2:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT: vpextrw $0, %xmm0, (%rsi)
; AVX-NEXT: retq
;
; AVX512-LABEL: shuffle_v16i8_to_v2i8_2:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vpextrw $0, %xmm0, (%rsi)
; AVX512-NEXT: retq
  %vec = load <16 x i8>, ptr %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 2, i32 10>
  store <2 x i8> %strided.vec, ptr %S
  ret void
}

define void @shuffle_v16i8_to_v2i8_3(ptr %L, ptr %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v2i8_3:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: movw %ax, (%rsi)
; SSE2-NEXT: retq
;
; SSE42-LABEL: shuffle_v16i8_to_v2i8_3:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT: pextrw $0, %xmm0, (%rsi)
; SSE42-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_to_v2i8_3:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT: vpextrw $0, %xmm0, (%rsi)
; AVX-NEXT: retq
;
; AVX512-LABEL: shuffle_v16i8_to_v2i8_3:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vpextrw $0, %xmm0, (%rsi)
; AVX512-NEXT: retq
  %vec = load <16 x i8>, ptr %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 3, i32 11>
  store <2 x i8> %strided.vec, ptr %S
  ret void
}

define void @shuffle_v16i8_to_v2i8_4(ptr %L, ptr %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v2i8_4:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: movw %ax, (%rsi)
; SSE2-NEXT: retq
;
; SSE42-LABEL: shuffle_v16i8_to_v2i8_4:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT: pextrw $0, %xmm0, (%rsi)
; SSE42-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_to_v2i8_4:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT: vpextrw $0, %xmm0, (%rsi)
; AVX-NEXT: retq
;
; AVX512-LABEL: shuffle_v16i8_to_v2i8_4:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vpextrw $0, %xmm0, (%rsi)
; AVX512-NEXT: retq
  %vec = load <16 x i8>, ptr %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 4, i32 12>
  store <2 x i8> %strided.vec, ptr %S
  ret void
}

define void @shuffle_v16i8_to_v2i8_5(ptr %L, ptr %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v2i8_5:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: movw %ax, (%rsi)
; SSE2-NEXT: retq
;
; SSE42-LABEL: shuffle_v16i8_to_v2i8_5:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT: pextrw $0, %xmm0, (%rsi)
; SSE42-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_to_v2i8_5:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT: vpextrw $0, %xmm0, (%rsi)
; AVX-NEXT: retq
;
; AVX512-LABEL: shuffle_v16i8_to_v2i8_5:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vpextrw $0, %xmm0, (%rsi)
; AVX512-NEXT: retq
  %vec = load <16 x i8>, ptr %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 5, i32 13>
  store <2 x i8> %strided.vec, ptr %S
  ret void
}

define void @shuffle_v16i8_to_v2i8_6(ptr %L, ptr %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v2i8_6:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: movw %ax, (%rsi)
; SSE2-NEXT: retq
;
; SSE42-LABEL: shuffle_v16i8_to_v2i8_6:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT: pextrw $0, %xmm0, (%rsi)
; SSE42-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_to_v2i8_6:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT: vpextrw $0, %xmm0, (%rsi)
; AVX-NEXT: retq
;
; AVX512-LABEL: shuffle_v16i8_to_v2i8_6:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vpextrw $0, %xmm0, (%rsi)
; AVX512-NEXT: retq
  %vec = load <16 x i8>, ptr %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 6, i32 14>
  store <2 x i8> %strided.vec, ptr %S
  ret void
}

define void @shuffle_v16i8_to_v2i8_7(ptr %L, ptr %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v2i8_7:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; SSE2-NEXT: psrlw $8, %xmm0
; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,3,3,3]
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: movw %ax, (%rsi)
; SSE2-NEXT: retq
;
; SSE42-LABEL: shuffle_v16i8_to_v2i8_7:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT: pextrw $0, %xmm0, (%rsi)
; SSE42-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_to_v2i8_7:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT: vpextrw $0, %xmm0, (%rsi)
; AVX-NEXT: retq
;
; AVX512-LABEL: shuffle_v16i8_to_v2i8_7:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vpextrw $0, %xmm0, (%rsi)
; AVX512-NEXT: retq
  %vec = load <16 x i8>, ptr %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 7, i32 15>
  store <2 x i8> %strided.vec, ptr %S
  ret void
}