1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
3 ; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,SSE42
4 ; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
5 ; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW
6 ; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST,AVX2-FAST-ALL
7 ; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST,AVX2-FAST-PERLANE
8 ; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+xop | FileCheck %s --check-prefixes=AVX,XOP
; Widening shuffle of two <2 x i64> inputs into a 3-element vector
; <a[0], b[0], a[1]>, which is then stored to %p.
; The check lines are autogenerated (utils/update_llc_test_checks.py); regenerate
; them instead of editing by hand.
10 define void @v3i64(<2 x i64> %a, <2 x i64> %b, ptr %p) nounwind {
13 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
14 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
15 ; SSE2-NEXT: movq %xmm2, 16(%rdi)
16 ; SSE2-NEXT: movdqa %xmm0, (%rdi)
21 ; SSE42-NEXT: pextrq $1, %xmm0, 16(%rdi)
22 ; SSE42-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
23 ; SSE42-NEXT: movdqa %xmm0, (%rdi)
28 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm0[0],xmm1[0]
29 ; AVX-NEXT: vpextrq $1, %xmm0, 16(%rdi)
30 ; AVX-NEXT: vmovdqa %xmm1, (%rdi)
; Mask indices 0-1 select lanes of %a, indices 2-3 select lanes of %b.
32 %r = shufflevector <2 x i64> %a, <2 x i64> %b, <3 x i32> <i32 0, i32 2, i32 1>
33 store <3 x i64> %r, ptr %p
; Floating-point variant of the 3-element widening shuffle, taking two
; <2 x double> inputs and storing <a[0], b[0], a[1]> to %p.
; The check lines are autogenerated (utils/update_llc_test_checks.py); regenerate
; them instead of editing by hand.
36 define void @v3f64(<2 x double> %a, <2 x double> %b, ptr %p) nounwind {
39 ; SSE-NEXT: movhps %xmm0, 16(%rdi)
40 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
41 ; SSE-NEXT: movaps %xmm0, (%rdi)
46 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm1[0]
47 ; AVX-NEXT: vmovhps %xmm0, 16(%rdi)
48 ; AVX-NEXT: vmovaps %xmm1, (%rdi)
; Mask indices 0-1 select lanes of %a, indices 2-3 select lanes of %b.
50 %r = shufflevector <2 x double> %a, <2 x double> %b, <3 x i32> <i32 0, i32 2, i32 1>
51 store <3 x double> %r, ptr %p
; Widening shuffle of two <2 x i32> inputs into a 3-element vector
; <a[0], b[0], a[1]>, which is then stored to %p.
; The check lines are autogenerated (utils/update_llc_test_checks.py); regenerate
; them instead of editing by hand.
55 define void @v3i32(<2 x i32> %a, <2 x i32> %b, ptr %p) nounwind {
58 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
59 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
60 ; SSE2-NEXT: movd %xmm2, 8(%rdi)
61 ; SSE2-NEXT: movq %xmm0, (%rdi)
66 ; SSE42-NEXT: extractps $1, %xmm0, 8(%rdi)
67 ; SSE42-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
68 ; SSE42-NEXT: movlps %xmm0, (%rdi)
73 ; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
74 ; AVX-NEXT: vextractps $1, %xmm0, 8(%rdi)
75 ; AVX-NEXT: vmovlps %xmm1, (%rdi)
; Mask indices 0-1 select lanes of %a, indices 2-3 select lanes of %b.
77 %r = shufflevector <2 x i32> %a, <2 x i32> %b, <3 x i32> <i32 0, i32 2, i32 1>
78 store <3 x i32> %r, ptr %p
; Widening shuffle of two <4 x i16> inputs into a 5-element vector
; <a[0], b[1], a[1], b[2], a[3]>, which is then stored to %p.
; The check lines are autogenerated (utils/update_llc_test_checks.py); regenerate
; them instead of editing by hand.
82 define void @v5i16(<4 x i16> %a, <4 x i16> %b, ptr %p) nounwind {
85 ; SSE2-NEXT: psrlq $16, %xmm1
86 ; SSE2-NEXT: pextrw $3, %xmm0, %eax
87 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
88 ; SSE2-NEXT: movw %ax, 8(%rdi)
89 ; SSE2-NEXT: movq %xmm0, (%rdi)
94 ; SSE42-NEXT: psrlq $16, %xmm1
95 ; SSE42-NEXT: pextrw $3, %xmm0, 8(%rdi)
96 ; SSE42-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
97 ; SSE42-NEXT: movq %xmm0, (%rdi)
102 ; AVX-NEXT: vpsrlq $16, %xmm1, %xmm1
103 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
104 ; AVX-NEXT: vpextrw $3, %xmm0, 8(%rdi)
105 ; AVX-NEXT: vmovq %xmm1, (%rdi)
; Mask indices 0-3 select lanes of %a, indices 4-7 select lanes of %b.
107 %r = shufflevector <4 x i16> %a, <4 x i16> %b, <5 x i32> <i32 0, i32 5, i32 1, i32 6, i32 3>
108 store <5 x i16> %r, ptr %p
; Widening shuffle of two <4 x i32> inputs into a 5-element vector
; <a[0], b[1], a[1], b[2], a[3]>, which is then stored to %p.
; The check lines are autogenerated (utils/update_llc_test_checks.py); regenerate
; them instead of editing by hand.
112 define void @v5i32(<4 x i32> %a, <4 x i32> %b, ptr %p) nounwind {
115 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,2,2,3]
116 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
117 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
118 ; SSE2-NEXT: movd %xmm2, 16(%rdi)
119 ; SSE2-NEXT: movdqa %xmm0, (%rdi)
122 ; SSE42-LABEL: v5i32:
124 ; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,2,2,3]
125 ; SSE42-NEXT: pextrd $3, %xmm0, 16(%rdi)
126 ; SSE42-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
127 ; SSE42-NEXT: movdqa %xmm0, (%rdi)
132 ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,2,2,3]
133 ; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
134 ; AVX-NEXT: vextractps $3, %xmm0, 16(%rdi)
135 ; AVX-NEXT: vmovaps %xmm1, (%rdi)
; Mask indices 0-3 select lanes of %a, indices 4-7 select lanes of %b.
137 %r = shufflevector <4 x i32> %a, <4 x i32> %b, <5 x i32> <i32 0, i32 5, i32 1, i32 6, i32 3>
138 store <5 x i32> %r, ptr %p
; Floating-point variant of the 5-element widening shuffle, taking two
; <4 x float> inputs and storing <a[0], b[1], a[1], b[2], a[3]> to %p.
; The check lines are autogenerated (utils/update_llc_test_checks.py); regenerate
; them instead of editing by hand.
142 define void @v5f32(<4 x float> %a, <4 x float> %b, ptr %p) nounwind {
145 ; SSE2-NEXT: movaps %xmm0, %xmm2
146 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[1,2]
147 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
148 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
149 ; SSE2-NEXT: movss %xmm0, 16(%rdi)
150 ; SSE2-NEXT: movaps %xmm2, (%rdi)
153 ; SSE42-LABEL: v5f32:
155 ; SSE42-NEXT: extractps $3, %xmm0, 16(%rdi)
156 ; SSE42-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[1,2]
157 ; SSE42-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
158 ; SSE42-NEXT: movaps %xmm0, (%rdi)
163 ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,1],xmm1[1,2]
164 ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
165 ; AVX-NEXT: vextractps $3, %xmm0, 16(%rdi)
166 ; AVX-NEXT: vmovaps %xmm1, (%rdi)
; Mask indices 0-3 select lanes of %a, indices 4-7 select lanes of %b.
168 %r = shufflevector <4 x float> %a, <4 x float> %b, <5 x i32> <i32 0, i32 5, i32 1, i32 6, i32 3>
169 store <5 x float> %r, ptr %p
; Widening shuffle of two <4 x i8> inputs into a 7-element vector
; <a[0], b[2], a[3], b[2], a[1], b[3], b[0]>, which is then stored to %p.
; The check lines are autogenerated (utils/update_llc_test_checks.py); regenerate
; them instead of editing by hand.
173 define void @v7i8(<4 x i8> %a, <4 x i8> %b, ptr %p) nounwind {
176 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
177 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,1,3,4,5,6,7]
178 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,255,0,255,0,255,255,255,255,255,255,255,255,255,255,255]
179 ; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
180 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
181 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,3,0,4,5,6,7]
182 ; SSE2-NEXT: pand %xmm2, %xmm1
183 ; SSE2-NEXT: pandn %xmm0, %xmm2
184 ; SSE2-NEXT: por %xmm1, %xmm2
185 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
186 ; SSE2-NEXT: movb %al, 6(%rdi)
187 ; SSE2-NEXT: movd %xmm2, (%rdi)
188 ; SSE2-NEXT: pextrw $2, %xmm2, %eax
189 ; SSE2-NEXT: movw %ax, 4(%rdi)
194 ; SSE42-NEXT: pextrb $0, %xmm1, 6(%rdi)
195 ; SSE42-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
196 ; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[1,4,7,4,3,6,0,u,u,u,u,u,u,u,u,u]
197 ; SSE42-NEXT: pextrw $2, %xmm1, 4(%rdi)
198 ; SSE42-NEXT: movd %xmm1, (%rdi)
203 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
204 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,4,7,4,3,6,0,u,u,u,u,u,u,u,u,u]
205 ; AVX1-NEXT: vpextrb $0, %xmm1, 6(%rdi)
206 ; AVX1-NEXT: vpextrw $2, %xmm0, 4(%rdi)
207 ; AVX1-NEXT: vmovd %xmm0, (%rdi)
212 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
213 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,4,7,4,3,6,0,u,u,u,u,u,u,u,u,u]
214 ; AVX2-NEXT: vpextrb $0, %xmm1, 6(%rdi)
215 ; AVX2-NEXT: vpextrw $2, %xmm0, 4(%rdi)
216 ; AVX2-NEXT: vmovd %xmm0, (%rdi)
221 ; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0],xmm1[2],xmm0[3],xmm1[2],xmm0[1],xmm1[3,0,u,u,u,u,u,u,u,u,u]
222 ; XOP-NEXT: vpextrb $0, %xmm1, 6(%rdi)
223 ; XOP-NEXT: vpextrw $2, %xmm0, 4(%rdi)
224 ; XOP-NEXT: vmovd %xmm0, (%rdi)
; Mask indices 0-3 select lanes of %a, indices 4-7 select lanes of %b.
226 %r = shufflevector <4 x i8> %a, <4 x i8> %b, <7 x i32> <i32 0, i32 6, i32 3, i32 6, i32 1, i32 7, i32 4>
227 store <7 x i8> %r, ptr %p
; Widening shuffle of two <4 x i16> inputs into a 7-element vector
; <a[0], b[2], a[3], b[2], a[1], b[3], b[0]>, which is then stored to %p.
; The check lines are autogenerated (utils/update_llc_test_checks.py); regenerate
; them instead of editing by hand.
231 define void @v7i16(<4 x i16> %a, <4 x i16> %b, ptr %p) nounwind {
234 ; SSE2-NEXT: movd %xmm1, %eax
235 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
236 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,1,0,3,4,5,6,7]
237 ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
238 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
239 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,2,4,5,6,7]
240 ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,5,6,4,7]
241 ; SSE2-NEXT: movw %ax, 12(%rdi)
242 ; SSE2-NEXT: movq %xmm0, (%rdi)
243 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
244 ; SSE2-NEXT: movd %xmm0, 8(%rdi)
247 ; SSE42-LABEL: v7i16:
249 ; SSE42-NEXT: pextrw $0, %xmm1, 12(%rdi)
250 ; SSE42-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
251 ; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[2,3,8,9,14,15,8,9,6,7,12,13,0,1,14,15]
252 ; SSE42-NEXT: pextrd $2, %xmm1, 8(%rdi)
253 ; SSE42-NEXT: movq %xmm1, (%rdi)
258 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
259 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,8,9,14,15,8,9,6,7,12,13,0,1,14,15]
260 ; AVX1-NEXT: vpextrw $0, %xmm1, 12(%rdi)
261 ; AVX1-NEXT: vpextrd $2, %xmm0, 8(%rdi)
262 ; AVX1-NEXT: vmovq %xmm0, (%rdi)
267 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
268 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,8,9,14,15,8,9,6,7,12,13,0,1,14,15]
269 ; AVX2-NEXT: vpextrw $0, %xmm1, 12(%rdi)
270 ; AVX2-NEXT: vpextrd $2, %xmm0, 8(%rdi)
271 ; AVX2-NEXT: vmovq %xmm0, (%rdi)
276 ; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0,1],xmm1[4,5],xmm0[6,7],xmm1[4,5],xmm0[2,3],xmm1[6,7,0,1],xmm0[6,7]
277 ; XOP-NEXT: vpextrw $0, %xmm1, 12(%rdi)
278 ; XOP-NEXT: vpextrd $2, %xmm0, 8(%rdi)
279 ; XOP-NEXT: vmovq %xmm0, (%rdi)
; Mask indices 0-3 select lanes of %a, indices 4-7 select lanes of %b.
281 %r = shufflevector <4 x i16> %a, <4 x i16> %b, <7 x i32> <i32 0, i32 6, i32 3, i32 6, i32 1, i32 7, i32 4>
282 store <7 x i16> %r, ptr %p
; Widening shuffle of two <4 x i32> inputs into a 7-element vector
; <a[0], b[2], a[3], b[2], a[1], b[3], b[0]>, which is then stored to %p.
; The check lines are autogenerated (utils/update_llc_test_checks.py); regenerate
; them instead of editing by hand.
287 define void @v7i32(<4 x i32> %a, <4 x i32> %b, ptr %p) nounwind {
290 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,2,2]
291 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,1,0,3]
292 ; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
293 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3]
294 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
295 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
296 ; SSE2-NEXT: movd %xmm1, 24(%rdi)
297 ; SSE2-NEXT: movq %xmm0, 16(%rdi)
298 ; SSE2-NEXT: movdqa %xmm3, (%rdi)
301 ; SSE42-LABEL: v7i32:
303 ; SSE42-NEXT: movdqa %xmm0, %xmm2
304 ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7]
305 ; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,3,2]
306 ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
307 ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
308 ; SSE42-NEXT: movd %xmm1, 24(%rdi)
309 ; SSE42-NEXT: movq %xmm0, 16(%rdi)
310 ; SSE42-NEXT: movdqa %xmm2, (%rdi)
315 ; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm0[0,1],xmm1[2],xmm0[3]
316 ; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2,3,2]
317 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
318 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
319 ; AVX-NEXT: vmovss %xmm1, 24(%rdi)
320 ; AVX-NEXT: vmovlps %xmm0, 16(%rdi)
321 ; AVX-NEXT: vmovaps %xmm2, (%rdi)
; Mask indices 0-3 select lanes of %a, indices 4-7 select lanes of %b.
323 %r = shufflevector <4 x i32> %a, <4 x i32> %b, <7 x i32> <i32 0, i32 6, i32 3, i32 6, i32 1, i32 7, i32 4>
324 store <7 x i32> %r, ptr %p
; Widening shuffle of two <8 x i8> inputs into a 12-element vector that
; interleaves three 4-lane groups: a[0..3], a[4..7] and b[0..3], i.e.
; <a[0], a[4], b[0], a[1], a[5], b[1], ...>. The result is stored to %p.
; The check lines are autogenerated (utils/update_llc_test_checks.py); regenerate
; them instead of editing by hand.
328 define void @v12i8(<8 x i8> %a, <8 x i8> %b, ptr %p) nounwind {
331 ; SSE2-NEXT: pxor %xmm2, %xmm2
332 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
333 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
334 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,1,3,4,5,6,7]
335 ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
336 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
337 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,1,4,5,6,7]
338 ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,4]
339 ; SSE2-NEXT: packuswb %xmm2, %xmm0
340 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,0,255,255,0,255,255,0,255,255,0,255,255,255,255]
341 ; SSE2-NEXT: pand %xmm2, %xmm0
342 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,1,1,4,5,6,7]
343 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,3]
344 ; SSE2-NEXT: pandn %xmm1, %xmm2
345 ; SSE2-NEXT: por %xmm0, %xmm2
346 ; SSE2-NEXT: movq %xmm2, (%rdi)
347 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
348 ; SSE2-NEXT: movd %xmm0, 8(%rdi)
351 ; SSE42-LABEL: v12i8:
353 ; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
354 ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,8,1,2,10,3,4,12,5,6,14,7,u,u,u,u]
355 ; SSE42-NEXT: pextrd $2, %xmm0, 8(%rdi)
356 ; SSE42-NEXT: movq %xmm0, (%rdi)
361 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
362 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,1,2,10,3,4,12,5,6,14,7,u,u,u,u]
363 ; AVX1-NEXT: vpextrd $2, %xmm0, 8(%rdi)
364 ; AVX1-NEXT: vmovq %xmm0, (%rdi)
369 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
370 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,1,2,10,3,4,12,5,6,14,7,u,u,u,u]
371 ; AVX2-NEXT: vpextrd $2, %xmm0, 8(%rdi)
372 ; AVX2-NEXT: vmovq %xmm0, (%rdi)
377 ; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0,4],xmm1[0],xmm0[1,5],xmm1[1],xmm0[2,6],xmm1[2],xmm0[3,7],xmm1[3],xmm0[u,u,u,u]
378 ; XOP-NEXT: vpextrd $2, %xmm0, 8(%rdi)
379 ; XOP-NEXT: vmovq %xmm0, (%rdi)
; Mask indices 0-7 select lanes of %a, indices 8-15 select lanes of %b.
381 %r = shufflevector <8 x i8> %a, <8 x i8> %b, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
382 store <12 x i8> %r, ptr %p
; Widening shuffle of two <8 x i16> inputs into a 12-element vector that
; interleaves three 4-lane groups: a[0..3], a[4..7] and b[0..3], i.e.
; <a[0], a[4], b[0], a[1], a[5], b[1], ...>. The result is stored to %p.
; The check lines are autogenerated (utils/update_llc_test_checks.py); regenerate
; them instead of editing by hand.
386 define void @v12i16(<8 x i16> %a, <8 x i16> %b, ptr %p) nounwind {
387 ; SSE2-LABEL: v12i16:
389 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,0,0]
390 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,0,65535,65535,0,65535,65535]
391 ; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,6,5,4,7]
392 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,1]
393 ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,1,4,5,6,7]
394 ; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,6,4]
395 ; SSE2-NEXT: pand %xmm3, %xmm4
396 ; SSE2-NEXT: pandn %xmm2, %xmm3
397 ; SSE2-NEXT: por %xmm4, %xmm3
398 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
399 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,0,65535,65535,65535,65535]
400 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
401 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,1,3,4,5,6,7]
402 ; SSE2-NEXT: pand %xmm2, %xmm0
403 ; SSE2-NEXT: pandn %xmm1, %xmm2
404 ; SSE2-NEXT: por %xmm0, %xmm2
405 ; SSE2-NEXT: movq %xmm2, 16(%rdi)
406 ; SSE2-NEXT: movdqa %xmm3, (%rdi)
409 ; SSE42-LABEL: v12i16:
411 ; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
412 ; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,1,2,3]
413 ; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,1,3,4,5,6,7]
414 ; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0],xmm3[1,2],xmm2[3],xmm3[4,5,6,7]
415 ; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
416 ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,u,u,2,3,10,11,u,u,4,5,12,13]
417 ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7]
418 ; SSE42-NEXT: movdqa %xmm0, (%rdi)
419 ; SSE42-NEXT: movq %xmm3, 16(%rdi)
422 ; AVX1-LABEL: v12i16:
424 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
425 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,1,2,3]
426 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,1,3,4,5,6,7]
427 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3],xmm3[4,5,6,7]
428 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
429 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,u,u,2,3,10,11,u,u,4,5,12,13]
430 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7]
431 ; AVX1-NEXT: vmovdqa %xmm0, (%rdi)
432 ; AVX1-NEXT: vmovq %xmm2, 16(%rdi)
435 ; AVX2-SLOW-LABEL: v12i16:
436 ; AVX2-SLOW: # %bb.0:
437 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
438 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,1,2,3]
439 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,1,3,4,5,6,7]
440 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3],xmm3[4,5,6,7]
441 ; AVX2-SLOW-NEXT: vpbroadcastd %xmm1, %xmm1
442 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,u,u,2,3,10,11,u,u,4,5,12,13]
443 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7]
444 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, (%rdi)
445 ; AVX2-SLOW-NEXT: vmovq %xmm2, 16(%rdi)
446 ; AVX2-SLOW-NEXT: retq
448 ; AVX2-FAST-LABEL: v12i16:
449 ; AVX2-FAST: # %bb.0:
450 ; AVX2-FAST-NEXT: vpbroadcastd %xmm1, %xmm2
451 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,1,8,9,u,u,2,3,10,11,u,u,4,5,12,13]
452 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3,4],xmm2[5],xmm3[6,7]
453 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
454 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,6,7,14,15,u,u,u,u,u,u,u,u,u,u]
455 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7]
456 ; AVX2-FAST-NEXT: vmovq %xmm0, 16(%rdi)
457 ; AVX2-FAST-NEXT: vmovdqa %xmm2, (%rdi)
458 ; AVX2-FAST-NEXT: retq
462 ; XOP-NEXT: vpperm {{.*#+}} xmm2 = xmm0[0,1,8,9],xmm1[0,1],xmm0[2,3,10,11],xmm1[2,3],xmm0[4,5,12,13]
463 ; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm1[4,5],xmm0[6,7,14,15],xmm1[6,7],xmm0[u,u,u,u,u,u,u,u]
464 ; XOP-NEXT: vmovq %xmm0, 16(%rdi)
465 ; XOP-NEXT: vmovdqa %xmm2, (%rdi)
; Mask indices 0-7 select lanes of %a, indices 8-15 select lanes of %b.
467 %r = shufflevector <8 x i16> %a, <8 x i16> %b, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
468 store <12 x i16> %r, ptr %p
; Widening shuffle of two <8 x i32> inputs into a 12-element vector that
; interleaves three 4-lane groups: a[0..3], a[4..7] and b[0..3], i.e.
; <a[0], a[4], b[0], a[1], a[5], b[1], ...>. The result is stored to %p.
; The check lines are autogenerated (utils/update_llc_test_checks.py); regenerate
; them instead of editing by hand.
472 define void @v12i32(<8 x i32> %a, <8 x i32> %b, ptr %p) nounwind {
473 ; SSE2-LABEL: v12i32:
475 ; SSE2-NEXT: movaps %xmm2, %xmm3
476 ; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[1,3]
477 ; SSE2-NEXT: movaps %xmm0, %xmm4
478 ; SSE2-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
479 ; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[0,2]
480 ; SSE2-NEXT: movaps %xmm0, %xmm3
481 ; SSE2-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1]
482 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3]
483 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm2[2,3]
484 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1]
485 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm3[0,2]
486 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3]
487 ; SSE2-NEXT: movaps %xmm2, 16(%rdi)
488 ; SSE2-NEXT: movaps %xmm4, (%rdi)
489 ; SSE2-NEXT: movaps %xmm0, 32(%rdi)
492 ; SSE42-LABEL: v12i32:
494 ; SSE42-NEXT: movdqa %xmm0, %xmm3
495 ; SSE42-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
496 ; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,2]
497 ; SSE42-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,1,0,1]
498 ; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm3[0,1,2,3],xmm4[4,5],xmm3[6,7]
499 ; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,2,2]
500 ; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm0[4,5],xmm3[6,7]
501 ; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5,6,7]
502 ; SSE42-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3]
503 ; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
504 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
505 ; SSE42-NEXT: movdqa %xmm1, 32(%rdi)
506 ; SSE42-NEXT: movdqa %xmm3, 16(%rdi)
507 ; SSE42-NEXT: movdqa %xmm4, (%rdi)
510 ; AVX1-LABEL: v12i32:
512 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,0,1]
513 ; AVX1-NEXT: vmovsldup {{.*#+}} ymm2 = ymm2[0,0,2,2,4,4,6,6]
514 ; AVX1-NEXT: vpermilps {{.*#+}} ymm3 = ymm0[0,u,u,1,5,u,u,6]
515 ; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4,5],ymm2[6],ymm3[7]
516 ; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm1[0,1,0,1]
517 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm3
518 ; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
519 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
520 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3],xmm3[3,3]
521 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3]
522 ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
523 ; AVX1-NEXT: vmovaps %xmm0, 32(%rdi)
524 ; AVX1-NEXT: vmovaps %ymm2, (%rdi)
525 ; AVX1-NEXT: vzeroupper
528 ; AVX2-SLOW-LABEL: v12i32:
529 ; AVX2-SLOW: # %bb.0:
530 ; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm2 = <0,4,u,1,5,u,2,6>
531 ; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm2, %ymm2
532 ; AVX2-SLOW-NEXT: vbroadcastsd %xmm1, %ymm3
533 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
534 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm3
535 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3],xmm3[3,3]
536 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3]
537 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
538 ; AVX2-SLOW-NEXT: vmovaps %xmm0, 32(%rdi)
539 ; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rdi)
540 ; AVX2-SLOW-NEXT: vzeroupper
541 ; AVX2-SLOW-NEXT: retq
543 ; AVX2-FAST-ALL-LABEL: v12i32:
544 ; AVX2-FAST-ALL: # %bb.0:
545 ; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm2 = <0,4,u,1,5,u,2,6>
546 ; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm2, %ymm2
547 ; AVX2-FAST-ALL-NEXT: vbroadcastsd %xmm1, %ymm3
548 ; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
549 ; AVX2-FAST-ALL-NEXT: vbroadcastsd {{.*#+}} ymm3 = [7,3,7,3,7,3,7,3]
550 ; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm3, %ymm0
551 ; AVX2-FAST-ALL-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3]
552 ; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
553 ; AVX2-FAST-ALL-NEXT: vmovaps %xmm0, 32(%rdi)
554 ; AVX2-FAST-ALL-NEXT: vmovaps %ymm2, (%rdi)
555 ; AVX2-FAST-ALL-NEXT: vzeroupper
556 ; AVX2-FAST-ALL-NEXT: retq
558 ; AVX2-FAST-PERLANE-LABEL: v12i32:
559 ; AVX2-FAST-PERLANE: # %bb.0:
560 ; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm2 = <0,4,u,1,5,u,2,6>
561 ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm2, %ymm2
562 ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm1, %ymm3
563 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
564 ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm3
565 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3],xmm3[3,3]
566 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3]
567 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
568 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, 32(%rdi)
569 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rdi)
570 ; AVX2-FAST-PERLANE-NEXT: vzeroupper
571 ; AVX2-FAST-PERLANE-NEXT: retq
575 ; XOP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,0,1]
576 ; XOP-NEXT: vpermil2ps {{.*#+}} ymm2 = ymm0[0],ymm2[0],ymm0[u,1,5,u],ymm2[6],ymm0[6]
577 ; XOP-NEXT: vshufps {{.*#+}} xmm3 = xmm1[0,1,0,1]
578 ; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm3
579 ; XOP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
580 ; XOP-NEXT: vextractf128 $1, %ymm0, %xmm3
581 ; XOP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3],xmm3[3,3]
582 ; XOP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3]
583 ; XOP-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
584 ; XOP-NEXT: vmovaps %xmm0, 32(%rdi)
585 ; XOP-NEXT: vmovaps %ymm2, (%rdi)
586 ; XOP-NEXT: vzeroupper
; Mask indices 0-7 select lanes of %a, indices 8-15 select lanes of %b.
588 %r = shufflevector <8 x i32> %a, <8 x i32> %b, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
589 store <12 x i32> %r, ptr %p
; Three-way element interleave of <4 x i8> inputs %a, %b and %c into a
; 12-element vector <a[0], b[0], c[0], a[1], b[1], c[1], ...>, stored to %p
; with align 1. The interleave is expressed as a chain of three shuffles.
; NOTE(review): the name suggests this is the regression test for LLVM PR29025;
; confirm against the bug tracker.
; The check lines are autogenerated (utils/update_llc_test_checks.py); regenerate
; them instead of editing by hand.
593 define void @pr29025(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, ptr%p) nounwind {
594 ; SSE2-LABEL: pr29025:
596 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
597 ; SSE2-NEXT: pxor %xmm1, %xmm1
598 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
599 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
600 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,1,3,4,5,6,7]
601 ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
602 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
603 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,1,4,5,6,7]
604 ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,4]
605 ; SSE2-NEXT: packuswb %xmm1, %xmm0
606 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,255,255,0,255,255,0,255,255,0,255,255,255,255]
607 ; SSE2-NEXT: pand %xmm1, %xmm0
608 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,1,1,4,5,6,7]
609 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,3]
610 ; SSE2-NEXT: pandn %xmm2, %xmm1
611 ; SSE2-NEXT: por %xmm0, %xmm1
612 ; SSE2-NEXT: movq %xmm1, (%rdi)
613 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
614 ; SSE2-NEXT: movd %xmm0, 8(%rdi)
617 ; SSE42-LABEL: pr29025:
619 ; SSE42-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
620 ; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
621 ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,8,1,2,10,3,4,12,5,6,14,7,u,u,u,u]
622 ; SSE42-NEXT: pextrd $2, %xmm0, 8(%rdi)
623 ; SSE42-NEXT: movq %xmm0, (%rdi)
626 ; AVX1-LABEL: pr29025:
628 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
629 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
630 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,1,2,10,3,4,12,5,6,14,7,u,u,u,u]
631 ; AVX1-NEXT: vpextrd $2, %xmm0, 8(%rdi)
632 ; AVX1-NEXT: vmovq %xmm0, (%rdi)
635 ; AVX2-LABEL: pr29025:
637 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
638 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
639 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,1,2,10,3,4,12,5,6,14,7,u,u,u,u]
640 ; AVX2-NEXT: vpextrd $2, %xmm0, 8(%rdi)
641 ; AVX2-NEXT: vmovq %xmm0, (%rdi)
644 ; XOP-LABEL: pr29025:
646 ; XOP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
647 ; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0,4],xmm2[0],xmm0[1,5],xmm2[1],xmm0[2,6],xmm2[2],xmm0[3,7],xmm2[3],xmm0[u,u,u,u]
648 ; XOP-NEXT: vpextrd $2, %xmm0, 8(%rdi)
649 ; XOP-NEXT: vmovq %xmm0, (%rdi)
; %s1 concatenates %a (lanes 0-3) and %b (lanes 4-7) into one <8 x i8>.
651 %s1 = shufflevector <4 x i8> %a, <4 x i8> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; %s2 widens %c to <8 x i8>; its upper four lanes are undef.
652 %s2 = shufflevector <4 x i8> %c, <4 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
; Mask indices 0-7 select lanes of %s1, indices 8-11 select the defined lanes of %s2.
653 %r = shufflevector <8 x i8> %s1, <8 x i8> %s2, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
654 store <12 x i8> %r, ptr %p, align 1
658 define void @interleave_24i8_out(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind {
659 ; SSE2-LABEL: interleave_24i8_out:
661 ; SSE2-NEXT: movdqu (%rdi), %xmm0
662 ; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
663 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,0,255,255,0,255,255,255,255,255,255,255,255,255,255]
664 ; SSE2-NEXT: movdqa %xmm0, %xmm2
665 ; SSE2-NEXT: pand %xmm4, %xmm2
666 ; SSE2-NEXT: pandn %xmm1, %xmm4
667 ; SSE2-NEXT: por %xmm2, %xmm4
668 ; SSE2-NEXT: pxor %xmm2, %xmm2
669 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
670 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,0,65535,65535,0]
671 ; SSE2-NEXT: pand %xmm5, %xmm4
672 ; SSE2-NEXT: movdqa %xmm0, %xmm3
673 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
674 ; SSE2-NEXT: pandn %xmm3, %xmm5
675 ; SSE2-NEXT: por %xmm4, %xmm5
676 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,1,3]
677 ; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,5]
678 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,1]
679 ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,1,4,5,6,7]
680 ; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,4,7]
681 ; SSE2-NEXT: packuswb %xmm4, %xmm4
682 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [0,255,255,0,255,255,0,255,255,255,255,255,255,255,255,255]
683 ; SSE2-NEXT: movdqa %xmm0, %xmm6
684 ; SSE2-NEXT: pand %xmm5, %xmm6
685 ; SSE2-NEXT: pandn %xmm1, %xmm5
686 ; SSE2-NEXT: por %xmm6, %xmm5
687 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
688 ; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,0,65535,65535,0,65535,65535]
689 ; SSE2-NEXT: pand %xmm6, %xmm5
690 ; SSE2-NEXT: pandn %xmm3, %xmm6
691 ; SSE2-NEXT: por %xmm5, %xmm6
692 ; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm6[2,1,0,3,4,5,6,7]
693 ; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,4,7]
694 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,1]
695 ; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,2,3,0,4,5,6,7]
696 ; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,6,7,4]
697 ; SSE2-NEXT: packuswb %xmm5, %xmm5
698 ; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [255,0,255,255,0,255,255,0,255,255,255,255,255,255,255,255]
699 ; SSE2-NEXT: pand %xmm6, %xmm0
700 ; SSE2-NEXT: pandn %xmm1, %xmm6
701 ; SSE2-NEXT: por %xmm0, %xmm6
702 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
703 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,65535,65535,0,65535,65535,0,65535]
704 ; SSE2-NEXT: pand %xmm0, %xmm6
705 ; SSE2-NEXT: pandn %xmm3, %xmm0
706 ; SSE2-NEXT: por %xmm6, %xmm0
707 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
708 ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5]
709 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
710 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7]
711 ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
712 ; SSE2-NEXT: packuswb %xmm0, %xmm0
713 ; SSE2-NEXT: movq %xmm4, (%rsi)
714 ; SSE2-NEXT: movq %xmm5, (%rdx)
715 ; SSE2-NEXT: movq %xmm0, (%rcx)
718 ; SSE42-LABEL: interleave_24i8_out:
720 ; SSE42-NEXT: movdqu (%rdi), %xmm0
721 ; SSE42-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
722 ; SSE42-NEXT: movdqa %xmm1, %xmm2
723 ; SSE42-NEXT: pshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm2[2,5,u,u,u,u,u,u,u,u]
724 ; SSE42-NEXT: movdqa %xmm0, %xmm3
725 ; SSE42-NEXT: pshufb {{.*#+}} xmm3 = xmm3[0,3,6,9,12,15],zero,zero,xmm3[u,u,u,u,u,u,u,u]
726 ; SSE42-NEXT: por %xmm2, %xmm3
727 ; SSE42-NEXT: movdqa %xmm1, %xmm2
728 ; SSE42-NEXT: pshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm2[0,3,6,u,u,u,u,u,u,u,u]
729 ; SSE42-NEXT: movdqa %xmm0, %xmm4
730 ; SSE42-NEXT: pshufb {{.*#+}} xmm4 = xmm4[1,4,7,10,13],zero,zero,zero,xmm4[u,u,u,u,u,u,u,u]
731 ; SSE42-NEXT: por %xmm2, %xmm4
732 ; SSE42-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u]
733 ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
734 ; SSE42-NEXT: por %xmm1, %xmm0
735 ; SSE42-NEXT: movq %xmm3, (%rsi)
736 ; SSE42-NEXT: movq %xmm4, (%rdx)
737 ; SSE42-NEXT: movq %xmm0, (%rcx)
740 ; AVX1-LABEL: interleave_24i8_out:
742 ; AVX1-NEXT: vmovdqu (%rdi), %xmm0
743 ; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
744 ; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u]
745 ; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u]
746 ; AVX1-NEXT: vpor %xmm2, %xmm3, %xmm2
747 ; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u]
748 ; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
749 ; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3
750 ; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u]
751 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
752 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
753 ; AVX1-NEXT: vmovq %xmm2, (%rsi)
754 ; AVX1-NEXT: vmovq %xmm3, (%rdx)
755 ; AVX1-NEXT: vmovq %xmm0, (%rcx)
758 ; AVX2-LABEL: interleave_24i8_out:
760 ; AVX2-NEXT: vmovdqu (%rdi), %xmm0
761 ; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
762 ; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u]
763 ; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u]
764 ; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2
765 ; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u]
766 ; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
767 ; AVX2-NEXT: vpor %xmm3, %xmm4, %xmm3
768 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u]
769 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
770 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
771 ; AVX2-NEXT: vmovq %xmm2, (%rsi)
772 ; AVX2-NEXT: vmovq %xmm3, (%rdx)
773 ; AVX2-NEXT: vmovq %xmm0, (%rcx)
776 ; XOP-LABEL: interleave_24i8_out:
778 ; XOP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
779 ; XOP-NEXT: vmovdqu (%rdi), %xmm1
780 ; XOP-NEXT: vpperm {{.*#+}} xmm2 = xmm1[0,3,6,9,12,15],xmm0[2,5],xmm1[u,u,u,u,u,u,u,u]
781 ; XOP-NEXT: vpperm {{.*#+}} xmm3 = xmm1[1,4,7,10,13],xmm0[0,3,6],xmm1[u,u,u,u,u,u,u,u]
782 ; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm1[2,5,8,11,14],xmm0[1,4,7],xmm1[u,u,u,u,u,u,u,u]
783 ; XOP-NEXT: vmovq %xmm2, (%rsi)
784 ; XOP-NEXT: vmovq %xmm3, (%rdx)
785 ; XOP-NEXT: vmovq %xmm0, (%rcx)
787 %wide.vec = load <24 x i8>, ptr %p, align 4
788 %s1 = shufflevector <24 x i8> %wide.vec, <24 x i8> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
789 %s2 = shufflevector <24 x i8> %wide.vec, <24 x i8> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
790 %s3 = shufflevector <24 x i8> %wide.vec, <24 x i8> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
791 store <8 x i8> %s1, ptr %q1, align 4
792 store <8 x i8> %s2, ptr %q2, align 4
793 store <8 x i8> %s3, ptr %q3, align 4
; interleave_24i8_in: 3-way byte interleave (store side).
; Loads one <8 x i8> vector from each of %q1, %q2 and %q3 and writes a single
; <24 x i8> vector to %p whose lanes are s1[0], s2[0], s3[0], s1[1], s2[1],
; s3[1], ..., s1[7], s2[7], s3[7].
; The assertion lines that follow the function header are autogenerated (see
; the note on the first line of this file); regenerate them with that script
; instead of editing them by hand.
797 define void @interleave_24i8_in(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind {
798 ; SSE2-LABEL: interleave_24i8_in:
800 ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
801 ; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
802 ; SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero
803 ; SSE2-NEXT: pxor %xmm3, %xmm3
804 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
805 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,2,2]
806 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,0,65535,65535,0,65535,65535]
807 ; SSE2-NEXT: pand %xmm5, %xmm4
808 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
809 ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[3,3,3,3,4,5,6,7]
810 ; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4]
811 ; SSE2-NEXT: pandn %xmm3, %xmm5
812 ; SSE2-NEXT: por %xmm4, %xmm5
813 ; SSE2-NEXT: movdqa %xmm2, %xmm3
814 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
815 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1]
816 ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,2,2,4,5,6,7]
817 ; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,4,5]
818 ; SSE2-NEXT: packuswb %xmm5, %xmm3
819 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255]
820 ; SSE2-NEXT: pand %xmm4, %xmm3
821 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,1,0,1]
822 ; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,0,0,0,4,5,6,7]
823 ; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,6,6]
824 ; SSE2-NEXT: pandn %xmm5, %xmm4
825 ; SSE2-NEXT: por %xmm3, %xmm4
826 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
827 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
828 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,1,0,4,5,6,7]
829 ; SSE2-NEXT: packuswb %xmm1, %xmm1
830 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,255,0,255,255,0,255,255,255,255,255,255,255,255]
831 ; SSE2-NEXT: pand %xmm2, %xmm1
832 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,3,3,4,5,6,7]
833 ; SSE2-NEXT: pandn %xmm0, %xmm2
834 ; SSE2-NEXT: por %xmm1, %xmm2
835 ; SSE2-NEXT: movq %xmm2, 16(%rdi)
836 ; SSE2-NEXT: movdqu %xmm4, (%rdi)
839 ; SSE42-LABEL: interleave_24i8_in:
841 ; SSE42-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
842 ; SSE42-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
843 ; SSE42-NEXT: movq {{.*#+}} xmm2 = mem[0],zero
844 ; SSE42-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
845 ; SSE42-NEXT: movdqa %xmm2, %xmm1
846 ; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,8],zero,xmm1[1,9],zero,xmm1[2,10],zero,xmm1[3,11],zero,xmm1[4,12],zero,xmm1[5]
847 ; SSE42-NEXT: movdqa %xmm0, %xmm3
848 ; SSE42-NEXT: pshufb {{.*#+}} xmm3 = zero,zero,xmm3[0],zero,zero,xmm3[1],zero,zero,xmm3[2],zero,zero,xmm3[3],zero,zero,xmm3[4],zero
849 ; SSE42-NEXT: por %xmm1, %xmm3
850 ; SSE42-NEXT: pshufb {{.*#+}} xmm2 = xmm2[13],zero,xmm2[6,14],zero,xmm2[7,15],zero,xmm2[u,u,u,u,u,u,u,u]
851 ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = zero,xmm0[5],zero,zero,xmm0[6],zero,zero,xmm0[7,u,u,u,u,u,u,u,u]
852 ; SSE42-NEXT: por %xmm2, %xmm0
853 ; SSE42-NEXT: movq %xmm0, 16(%rdi)
854 ; SSE42-NEXT: movdqu %xmm3, (%rdi)
857 ; AVX1-LABEL: interleave_24i8_in:
859 ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
860 ; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
861 ; AVX1-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
862 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
863 ; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[0,8],zero,xmm1[1,9],zero,xmm1[2,10],zero,xmm1[3,11],zero,xmm1[4,12],zero,xmm1[5]
864 ; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm0[0],zero,zero,xmm0[1],zero,zero,xmm0[2],zero,zero,xmm0[3],zero,zero,xmm0[4],zero
865 ; AVX1-NEXT: vpor %xmm3, %xmm2, %xmm2
866 ; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[13],zero,xmm1[6,14],zero,xmm1[7,15],zero,xmm1[u,u,u,u,u,u,u,u]
867 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[5],zero,zero,xmm0[6],zero,zero,xmm0[7,u,u,u,u,u,u,u,u]
868 ; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0
869 ; AVX1-NEXT: vmovq %xmm0, 16(%rdi)
870 ; AVX1-NEXT: vmovdqu %xmm2, (%rdi)
873 ; AVX2-LABEL: interleave_24i8_in:
875 ; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
876 ; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
877 ; AVX2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
878 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
879 ; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[0,8],zero,xmm1[1,9],zero,xmm1[2,10],zero,xmm1[3,11],zero,xmm1[4,12],zero,xmm1[5]
880 ; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm0[0],zero,zero,xmm0[1],zero,zero,xmm0[2],zero,zero,xmm0[3],zero,zero,xmm0[4],zero
881 ; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2
882 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[13],zero,xmm1[6,14],zero,xmm1[7,15],zero,xmm1[u,u,u,u,u,u,u,u]
883 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[5],zero,zero,xmm0[6],zero,zero,xmm0[7,u,u,u,u,u,u,u,u]
884 ; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
885 ; AVX2-NEXT: vmovq %xmm0, 16(%rdi)
886 ; AVX2-NEXT: vmovdqu %xmm2, (%rdi)
889 ; XOP-LABEL: interleave_24i8_in:
891 ; XOP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
892 ; XOP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
893 ; XOP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
894 ; XOP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
895 ; XOP-NEXT: vpperm {{.*#+}} xmm2 = xmm1[0,8],xmm0[0],xmm1[1,9],xmm0[1],xmm1[2,10],xmm0[2],xmm1[3,11],xmm0[3],xmm1[4,12],xmm0[4],xmm1[5]
896 ; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm1[13],xmm0[5],xmm1[6,14],xmm0[6],xmm1[7,15],xmm0[7],xmm1[u,u,u,u,u,u,u,u]
897 ; XOP-NEXT: vmovq %xmm0, 16(%rdi)
898 ; XOP-NEXT: vmovdqu %xmm2, (%rdi)
900 %s1 = load <8 x i8>, ptr %q1, align 4
901 %s2 = load <8 x i8>, ptr %q2, align 4
902 %s3 = load <8 x i8>, ptr %q3, align 4
; %t1 is s1 followed by s2 (16 lanes); %t2 is s3 widened to 16 lanes, with the
; upper 8 lanes left undef.
903 %t1 = shufflevector <8 x i8> %s1, <8 x i8> %s2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
904 %t2 = shufflevector <8 x i8> %s3, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; In the mask below, indices 0-7 select s1, 8-15 select s2 and 16-23 select s3,
; so the (i, i+8, i+16) triples produce the 3-way interleave.
905 %interleaved = shufflevector <16 x i8> %t1, <16 x i8> %t2, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
906 store <24 x i8> %interleaved, ptr %p, align 4
; interleave_24i16_out: 3-way de-interleave of 16-bit elements (load side).
; Loads a <24 x i16> vector from %p and splits it by stride 3: elements
; 0,3,...,21 are stored to %q1, elements 1,4,...,22 to %q2 and elements
; 2,5,...,23 to %q3, each as an <8 x i16> vector.
; The assertion lines that follow the function header are autogenerated (see
; the note on the first line of this file); regenerate them with that script
; instead of editing them by hand.
911 define void @interleave_24i16_out(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind {
912 ; SSE2-LABEL: interleave_24i16_out:
914 ; SSE2-NEXT: movdqu (%rdi), %xmm3
915 ; SSE2-NEXT: movdqu 16(%rdi), %xmm2
916 ; SSE2-NEXT: movdqu 32(%rdi), %xmm0
917 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,0,65535,65535,0]
918 ; SSE2-NEXT: movdqa %xmm3, %xmm4
919 ; SSE2-NEXT: pand %xmm1, %xmm4
920 ; SSE2-NEXT: pandn %xmm2, %xmm1
921 ; SSE2-NEXT: por %xmm4, %xmm1
922 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
923 ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
924 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
925 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7]
926 ; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,7,6,7]
927 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,1,2,1]
928 ; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,6,5]
929 ; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,0],xmm4[2,0]
930 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,0]
931 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,0,65535,65535,0,65535,65535]
932 ; SSE2-NEXT: movdqa %xmm4, %xmm5
933 ; SSE2-NEXT: pandn %xmm2, %xmm5
934 ; SSE2-NEXT: movdqa %xmm3, %xmm6
935 ; SSE2-NEXT: pand %xmm4, %xmm6
936 ; SSE2-NEXT: por %xmm5, %xmm6
937 ; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm6[2,1,2,3,4,5,6,7]
938 ; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7]
939 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,3]
940 ; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,2,3,0,4,5,6,7]
941 ; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5]
942 ; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535,65535,0,0,0]
943 ; SSE2-NEXT: pand %xmm6, %xmm5
944 ; SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm0[0,3,2,3,4,5,6,7]
945 ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,0,3]
946 ; SSE2-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,5,6]
947 ; SSE2-NEXT: movdqa %xmm6, %xmm8
948 ; SSE2-NEXT: pandn %xmm7, %xmm8
949 ; SSE2-NEXT: por %xmm5, %xmm8
950 ; SSE2-NEXT: pand %xmm4, %xmm2
951 ; SSE2-NEXT: pandn %xmm3, %xmm4
952 ; SSE2-NEXT: por %xmm2, %xmm4
953 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[3,1,2,0]
954 ; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7]
955 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,0,3]
956 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,0,3,4,5,6,7]
957 ; SSE2-NEXT: pand %xmm6, %xmm2
958 ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
959 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
960 ; SSE2-NEXT: pandn %xmm0, %xmm6
961 ; SSE2-NEXT: por %xmm2, %xmm6
962 ; SSE2-NEXT: movups %xmm1, (%rsi)
963 ; SSE2-NEXT: movdqu %xmm8, (%rdx)
964 ; SSE2-NEXT: movdqu %xmm6, (%rcx)
967 ; SSE42-LABEL: interleave_24i16_out:
969 ; SSE42-NEXT: movdqu (%rdi), %xmm0
970 ; SSE42-NEXT: movdqu 16(%rdi), %xmm1
971 ; SSE42-NEXT: movdqu 32(%rdi), %xmm2
972 ; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,1,2,1]
973 ; SSE42-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,5]
974 ; SSE42-NEXT: movdqa %xmm0, %xmm4
975 ; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3],xmm1[4],xmm4[5,6],xmm1[7]
976 ; SSE42-NEXT: pshufb {{.*#+}} xmm4 = xmm4[0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u]
977 ; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5],xmm3[6,7]
978 ; SSE42-NEXT: movdqa %xmm2, %xmm3
979 ; SSE42-NEXT: pshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,u,u,0,1,6,7,12,13]
980 ; SSE42-NEXT: movdqa %xmm0, %xmm5
981 ; SSE42-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm1[2],xmm5[3,4],xmm1[5],xmm5[6,7]
982 ; SSE42-NEXT: pshufb {{.*#+}} xmm5 = xmm5[2,3,8,9,14,15,4,5,10,11,u,u,u,u,u,u]
983 ; SSE42-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm3[5,6,7]
984 ; SSE42-NEXT: pshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,u,2,3,8,9,14,15]
985 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7]
986 ; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[4,5,10,11,0,1,6,7,12,13,u,u,u,u,u,u]
987 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7]
988 ; SSE42-NEXT: movdqu %xmm4, (%rsi)
989 ; SSE42-NEXT: movdqu %xmm5, (%rdx)
990 ; SSE42-NEXT: movdqu %xmm1, (%rcx)
993 ; AVX1-LABEL: interleave_24i16_out:
995 ; AVX1-NEXT: vmovdqu (%rdi), %xmm0
996 ; AVX1-NEXT: vmovdqu 16(%rdi), %xmm1
997 ; AVX1-NEXT: vmovdqu 32(%rdi), %xmm2
998 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,2,1]
999 ; AVX1-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,5]
1000 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7]
1001 ; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u]
1002 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3,4,5],xmm3[6,7]
1003 ; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u,u,u,u,u,u,u,0,1,6,7,12,13]
1004 ; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7]
1005 ; AVX1-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,3,8,9,14,15,4,5,10,11,u,u,u,u,u,u]
1006 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3,4],xmm4[5,6,7]
1007 ; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,u,2,3,8,9,14,15]
1008 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7]
1009 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,0,1,6,7,12,13,u,u,u,u,u,u]
1010 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7]
1011 ; AVX1-NEXT: vmovdqu %xmm3, (%rsi)
1012 ; AVX1-NEXT: vmovdqu %xmm4, (%rdx)
1013 ; AVX1-NEXT: vmovdqu %xmm0, (%rcx)
1016 ; AVX2-LABEL: interleave_24i16_out:
1018 ; AVX2-NEXT: vmovdqu (%rdi), %xmm0
1019 ; AVX2-NEXT: vmovdqu 16(%rdi), %xmm1
1020 ; AVX2-NEXT: vmovdqu 32(%rdi), %xmm2
1021 ; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm2[2],xmm0[3,4],xmm2[5],xmm0[6,7]
1022 ; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3],xmm1[4],xmm3[5,6],xmm1[7]
1023 ; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11]
1024 ; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5],xmm2[6],xmm0[7]
1025 ; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm1[2],xmm4[3,4],xmm1[5],xmm4[6,7]
1026 ; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13]
1027 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6],xmm2[7]
1028 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7]
1029 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15]
1030 ; AVX2-NEXT: vmovdqu %xmm3, (%rsi)
1031 ; AVX2-NEXT: vmovdqu %xmm4, (%rdx)
1032 ; AVX2-NEXT: vmovdqu %xmm0, (%rcx)
1035 ; XOP-LABEL: interleave_24i16_out:
1037 ; XOP-NEXT: vmovdqu (%rdi), %xmm0
1038 ; XOP-NEXT: vmovdqu 16(%rdi), %xmm1
1039 ; XOP-NEXT: vmovdqu 32(%rdi), %xmm2
1040 ; XOP-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7]
1041 ; XOP-NEXT: vpperm {{.*#+}} xmm3 = xmm3[0,1,6,7,12,13,2,3,8,9,14,15],xmm2[4,5,10,11]
1042 ; XOP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7]
1043 ; XOP-NEXT: vpperm {{.*#+}} xmm4 = xmm4[2,3,8,9,14,15,4,5,10,11],xmm2[0,1,6,7,12,13]
1044 ; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7]
1045 ; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[4,5,10,11,0,1,6,7,12,13],xmm2[2,3,8,9,14,15]
1046 ; XOP-NEXT: vmovdqu %xmm3, (%rsi)
1047 ; XOP-NEXT: vmovdqu %xmm4, (%rdx)
1048 ; XOP-NEXT: vmovdqu %xmm0, (%rcx)
1050 %wide.vec = load <24 x i16>, ptr %p, align 4
; Each mask below walks the wide vector with stride 3, starting at offset 0, 1
; and 2 respectively.
1051 %s1 = shufflevector <24 x i16> %wide.vec, <24 x i16> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
1052 %s2 = shufflevector <24 x i16> %wide.vec, <24 x i16> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
1053 %s3 = shufflevector <24 x i16> %wide.vec, <24 x i16> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
1054 store <8 x i16> %s1, ptr %q1, align 4
1055 store <8 x i16> %s2, ptr %q2, align 4
1056 store <8 x i16> %s3, ptr %q3, align 4
; interleave_24i16_out_reverse: stride-3 split of a <24 x i16> load from %p,
; where the loaded vector is first reversed lane-by-lane (mask 23 down to 0).
; In terms of the lanes as loaded from memory: %q1 receives 23,20,...,2,
; %q2 receives 22,19,...,1 and %q3 receives 21,18,...,0, each stored as an
; <8 x i16> vector.
; The assertion lines that follow the function header are autogenerated (see
; the note on the first line of this file); regenerate them with that script
; instead of editing them by hand.
1060 define void @interleave_24i16_out_reverse(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind {
1061 ; SSE2-LABEL: interleave_24i16_out_reverse:
1063 ; SSE2-NEXT: movdqu (%rdi), %xmm0
1064 ; SSE2-NEXT: movdqu 16(%rdi), %xmm1
1065 ; SSE2-NEXT: movdqu 32(%rdi), %xmm3
1066 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,0,65535,65535,0]
1067 ; SSE2-NEXT: movdqa %xmm1, %xmm4
1068 ; SSE2-NEXT: pand %xmm2, %xmm4
1069 ; SSE2-NEXT: pandn %xmm3, %xmm2
1070 ; SSE2-NEXT: por %xmm4, %xmm2
1071 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3]
1072 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,0,3,4,5,6,7]
1073 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,1]
1074 ; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,5,6,6,7]
1075 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,1,2,1]
1076 ; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,5,6]
1077 ; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,0],xmm4[2,0]
1078 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,0,1,2,4,5,6,7]
1079 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,0]
1080 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,0,65535,65535,0,65535,65535]
1081 ; SSE2-NEXT: movdqa %xmm4, %xmm5
1082 ; SSE2-NEXT: pandn %xmm1, %xmm5
1083 ; SSE2-NEXT: movdqa %xmm3, %xmm6
1084 ; SSE2-NEXT: pand %xmm4, %xmm6
1085 ; SSE2-NEXT: por %xmm5, %xmm6
1086 ; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm6[0,3,2,3,4,5,6,7]
1087 ; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,6,7]
1088 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,1,1]
1089 ; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,1,0,3,4,5,6,7]
1090 ; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535,65535,0,0,0]
1091 ; SSE2-NEXT: pand %xmm6, %xmm5
1092 ; SSE2-NEXT: pshufhw {{.*#+}} xmm7 = xmm0[0,1,2,3,4,7,6,7]
1093 ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,2,0]
1094 ; SSE2-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,4,7]
1095 ; SSE2-NEXT: movdqa %xmm6, %xmm8
1096 ; SSE2-NEXT: pandn %xmm7, %xmm8
1097 ; SSE2-NEXT: por %xmm5, %xmm8
1098 ; SSE2-NEXT: pand %xmm4, %xmm1
1099 ; SSE2-NEXT: pandn %xmm3, %xmm4
1100 ; SSE2-NEXT: por %xmm1, %xmm4
1101 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[3,1,2,0]
1102 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
1103 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1104 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,0,1,2,4,5,6,7]
1105 ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7]
1106 ; SSE2-NEXT: pand %xmm6, %xmm1
1107 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
1108 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
1109 ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4]
1110 ; SSE2-NEXT: pandn %xmm0, %xmm6
1111 ; SSE2-NEXT: por %xmm1, %xmm6
1112 ; SSE2-NEXT: movups %xmm2, (%rsi)
1113 ; SSE2-NEXT: movdqu %xmm8, (%rdx)
1114 ; SSE2-NEXT: movdqu %xmm6, (%rcx)
1117 ; SSE42-LABEL: interleave_24i16_out_reverse:
1119 ; SSE42-NEXT: movdqu (%rdi), %xmm0
1120 ; SSE42-NEXT: movdqu 16(%rdi), %xmm1
1121 ; SSE42-NEXT: movdqu 32(%rdi), %xmm2
1122 ; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,1,2,1]
1123 ; SSE42-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,6]
1124 ; SSE42-NEXT: movdqa %xmm1, %xmm4
1125 ; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0],xmm2[1],xmm4[2,3],xmm2[4],xmm4[5,6],xmm2[7]
1126 ; SSE42-NEXT: pshufb {{.*#+}} xmm4 = xmm4[14,15,8,9,2,3,12,13,6,7,0,1,u,u,u,u]
1127 ; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5],xmm3[6,7]
1128 ; SSE42-NEXT: movdqa %xmm0, %xmm3
1129 ; SSE42-NEXT: pshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,u,u,14,15,8,9,2,3]
1130 ; SSE42-NEXT: movdqa %xmm2, %xmm5
1131 ; SSE42-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm1[2],xmm5[3,4],xmm1[5],xmm5[6,7]
1132 ; SSE42-NEXT: pshufb {{.*#+}} xmm5 = xmm5[12,13,6,7,0,1,10,11,4,5,u,u,u,u,u,u]
1133 ; SSE42-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm3[5,6,7]
1134 ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,12,13,6,7,0,1]
1135 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7]
1136 ; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[10,11,4,5,14,15,8,9,2,3,u,u,u,u,u,u]
1137 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm0[5,6,7]
1138 ; SSE42-NEXT: movdqu %xmm4, (%rsi)
1139 ; SSE42-NEXT: movdqu %xmm5, (%rdx)
1140 ; SSE42-NEXT: movdqu %xmm1, (%rcx)
1143 ; AVX1-LABEL: interleave_24i16_out_reverse:
1145 ; AVX1-NEXT: vmovdqu (%rdi), %xmm0
1146 ; AVX1-NEXT: vmovdqu 16(%rdi), %xmm1
1147 ; AVX1-NEXT: vmovdqu 32(%rdi), %xmm2
1148 ; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[14,15,8,9,2,3,u,u,u,u,u,u,u,u,u,u]
1149 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7]
1150 ; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,12,13,6,7,0,1,10,11,4,5]
1151 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3,4,5,6,7]
1152 ; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[12,13,6,7,0,1,u,u,u,u,u,u,u,u,u,u]
1153 ; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7]
1154 ; AVX1-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,10,11,4,5,14,15,8,9,2,3]
1155 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm5[3,4,5,6,7]
1156 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
1157 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7]
1158 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7]
1159 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,14,15,8,9,2,3,12,13,6,7,0,1]
1160 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3,4,5,6,7]
1161 ; AVX1-NEXT: vmovdqu %xmm3, (%rsi)
1162 ; AVX1-NEXT: vmovdqu %xmm4, (%rdx)
1163 ; AVX1-NEXT: vmovdqu %xmm0, (%rcx)
1166 ; AVX2-LABEL: interleave_24i16_out_reverse:
1168 ; AVX2-NEXT: vmovdqu (%rdi), %xmm0
1169 ; AVX2-NEXT: vmovdqu 16(%rdi), %xmm1
1170 ; AVX2-NEXT: vmovdqu 32(%rdi), %xmm2
1171 ; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6],xmm2[7]
1172 ; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0],xmm3[1,2],xmm1[3],xmm3[4,5],xmm1[6],xmm3[7]
1173 ; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[14,15,8,9,2,3,12,13,6,7,0,1,10,11,4,5]
1174 ; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5],xmm2[6],xmm0[7]
1175 ; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm1[2],xmm4[3,4],xmm1[5],xmm4[6,7]
1176 ; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[12,13,6,7,0,1,10,11,4,5,14,15,8,9,2,3]
1177 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3,4],xmm2[5],xmm0[6,7]
1178 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7]
1179 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,4,5,14,15,8,9,2,3,12,13,6,7,0,1]
1180 ; AVX2-NEXT: vmovdqu %xmm3, (%rsi)
1181 ; AVX2-NEXT: vmovdqu %xmm4, (%rdx)
1182 ; AVX2-NEXT: vmovdqu %xmm0, (%rcx)
1185 ; XOP-LABEL: interleave_24i16_out_reverse:
1187 ; XOP-NEXT: vmovdqu (%rdi), %xmm0
1188 ; XOP-NEXT: vmovdqu 16(%rdi), %xmm1
1189 ; XOP-NEXT: vmovdqu 32(%rdi), %xmm2
1190 ; XOP-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7]
1191 ; XOP-NEXT: vpperm {{.*#+}} xmm3 = xmm2[14,15,8,9,2,3],xmm3[12,13,6,7,0,1,10,11,4,5]
1192 ; XOP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7]
1193 ; XOP-NEXT: vpperm {{.*#+}} xmm4 = xmm2[12,13,6,7,0,1],xmm4[10,11,4,5,14,15,8,9,2,3]
1194 ; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7]
1195 ; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm2[10,11,4,5],xmm0[14,15,8,9,2,3,12,13,6,7,0,1]
1196 ; XOP-NEXT: vmovdqu %xmm3, (%rsi)
1197 ; XOP-NEXT: vmovdqu %xmm4, (%rdx)
1198 ; XOP-NEXT: vmovdqu %xmm0, (%rcx)
1200 %wide.vec.reverse = load <24 x i16>, ptr %p, align 4
; Reverse all 24 lanes first, so %wide.vec[i] is lane 23-i of the loaded vector.
1201 %wide.vec = shufflevector <24 x i16> %wide.vec.reverse, <24 x i16> undef, <24 x i32> <i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; Then split the reversed vector with stride 3, starting at offset 0, 1 and 2.
1202 %s1 = shufflevector <24 x i16> %wide.vec, <24 x i16> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
1203 %s2 = shufflevector <24 x i16> %wide.vec, <24 x i16> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
1204 %s3 = shufflevector <24 x i16> %wide.vec, <24 x i16> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
1205 store <8 x i16> %s1, ptr %q1, align 4
1206 store <8 x i16> %s2, ptr %q2, align 4
1207 store <8 x i16> %s3, ptr %q3, align 4
1211 define void @interleave_24i16_in(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind {
1212 ; SSE2-LABEL: interleave_24i16_in:
1214 ; SSE2-NEXT: movdqu (%rsi), %xmm0
1215 ; SSE2-NEXT: movdqu (%rdx), %xmm2
1216 ; SSE2-NEXT: movdqu (%rcx), %xmm3
1217 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,0,0,0]
1218 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,0,65535,65535,0,65535,65535]
1219 ; SSE2-NEXT: movdqa %xmm4, %xmm5
1220 ; SSE2-NEXT: pandn %xmm1, %xmm5
1221 ; SSE2-NEXT: movdqa %xmm0, %xmm1
1222 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
1223 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
1224 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7]
1225 ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,4,5]
1226 ; SSE2-NEXT: pand %xmm4, %xmm1
1227 ; SSE2-NEXT: por %xmm5, %xmm1
1228 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,2,2]
1229 ; SSE2-NEXT: pand %xmm4, %xmm5
1230 ; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm2[3,3,3,3,4,5,6,7]
1231 ; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4]
1232 ; SSE2-NEXT: pandn %xmm6, %xmm4
1233 ; SSE2-NEXT: por %xmm5, %xmm4
1234 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [0,65535,65535,0,65535,65535,0,65535]
1235 ; SSE2-NEXT: pand %xmm5, %xmm4
1236 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,1,2,2]
1237 ; SSE2-NEXT: pandn %xmm6, %xmm5
1238 ; SSE2-NEXT: por %xmm4, %xmm5
1239 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
1240 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [65535,0,65535,65535,0,65535,65535,0]
1241 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
1242 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,1,3,3]
1243 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,1,0,4,5,6,7]
1244 ; SSE2-NEXT: pand %xmm4, %xmm0
1245 ; SSE2-NEXT: pandn %xmm3, %xmm4
1246 ; SSE2-NEXT: por %xmm0, %xmm4
1247 ; SSE2-NEXT: movdqu %xmm4, 32(%rdi)
1248 ; SSE2-NEXT: movdqu %xmm5, 16(%rdi)
1249 ; SSE2-NEXT: movdqu %xmm1, (%rdi)
1252 ; SSE42-LABEL: interleave_24i16_in:
1254 ; SSE42-NEXT: movdqu (%rsi), %xmm0
1255 ; SSE42-NEXT: movdqu (%rdx), %xmm1
1256 ; SSE42-NEXT: movdqu (%rcx), %xmm2
1257 ; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,2,2]
1258 ; SSE42-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[3,3,3,3,4,5,6,7]
1259 ; SSE42-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4]
1260 ; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7]
1261 ; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,2,2]
1262 ; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2],xmm3[3],xmm4[4,5],xmm3[6],xmm4[7]
1263 ; SSE42-NEXT: movdqa %xmm0, %xmm4
1264 ; SSE42-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
1265 ; SSE42-NEXT: pshufb {{.*#+}} xmm4 = xmm4[0,1,2,3,u,u,4,5,6,7,u,u,8,9,10,11]
1266 ; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,0,0]
1267 ; SSE42-NEXT: pblendw {{.*#+}} xmm5 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7]
1268 ; SSE42-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1269 ; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u]
1270 ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3]
1271 ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3],xmm0[4],xmm1[5,6],xmm0[7]
1272 ; SSE42-NEXT: movdqu %xmm0, 32(%rdi)
1273 ; SSE42-NEXT: movdqu %xmm5, (%rdi)
1274 ; SSE42-NEXT: movdqu %xmm3, 16(%rdi)
1277 ; AVX1-LABEL: interleave_24i16_in:
1279 ; AVX1-NEXT: vmovdqu (%rsi), %xmm0
1280 ; AVX1-NEXT: vmovdqu (%rdx), %xmm1
1281 ; AVX1-NEXT: vmovdqu (%rcx), %xmm2
1282 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,2,2]
1283 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[3,3,3,3,4,5,6,7]
1284 ; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4]
1285 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7]
1286 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,2,2]
1287 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1,2],xmm4[3],xmm3[4,5],xmm4[6],xmm3[7]
1288 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1289 ; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u]
1290 ; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,2,3,3]
1291 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3],xmm5[4],xmm4[5,6],xmm5[7]
1292 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1293 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,u,u,4,5,6,7,u,u,8,9,10,11]
1294 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,0,0,0]
1295 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7]
1296 ; AVX1-NEXT: vmovdqu %xmm0, (%rdi)
1297 ; AVX1-NEXT: vmovdqu %xmm4, 32(%rdi)
1298 ; AVX1-NEXT: vmovdqu %xmm3, 16(%rdi)
1301 ; AVX2-SLOW-LABEL: interleave_24i16_in:
1302 ; AVX2-SLOW: # %bb.0:
1303 ; AVX2-SLOW-NEXT: vmovdqu (%rsi), %xmm0
1304 ; AVX2-SLOW-NEXT: vmovdqu (%rdx), %xmm1
1305 ; AVX2-SLOW-NEXT: vmovdqu (%rcx), %xmm2
1306 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3
1307 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[0,1,u,u,6,7,2,3,u,u,8,9,4,5,u,u,16,17,u,u,22,23,18,19,u,u,24,25,20,21,u,u]
1308 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1]
1309 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,0,1,u,u,u,u,2,3,u,u,u,u,4,5,u,u,22,23,u,u,u,u,24,25,u,u,u,u,26,27]
1310 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7],ymm4[8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14],ymm3[15]
1311 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <u,0,0,u,1,1,u,2>
1312 ; AVX2-SLOW-NEXT: vpermd %ymm2, %ymm4, %ymm4
1313 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255]
1314 ; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
1315 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1316 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u]
1317 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,2,3,3]
1318 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7]
1319 ; AVX2-SLOW-NEXT: vmovdqu %xmm0, 32(%rdi)
1320 ; AVX2-SLOW-NEXT: vmovdqu %ymm3, (%rdi)
1321 ; AVX2-SLOW-NEXT: vzeroupper
1322 ; AVX2-SLOW-NEXT: retq
1324 ; AVX2-FAST-ALL-LABEL: interleave_24i16_in:
1325 ; AVX2-FAST-ALL: # %bb.0:
1326 ; AVX2-FAST-ALL-NEXT: vmovdqu (%rsi), %xmm0
1327 ; AVX2-FAST-ALL-NEXT: vmovdqu (%rdx), %xmm1
1328 ; AVX2-FAST-ALL-NEXT: vmovdqu (%rcx), %xmm2
1329 ; AVX2-FAST-ALL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3
1330 ; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm4 = <u,0,0,u,1,1,u,2>
1331 ; AVX2-FAST-ALL-NEXT: vpermd %ymm2, %ymm4, %ymm4
1332 ; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm5 = [0,4,1,5,1,5,2,6]
1333 ; AVX2-FAST-ALL-NEXT: vpermd %ymm3, %ymm5, %ymm3
1334 ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,4,5,u,u,2,3,6,7,u,u,8,9,12,13,u,u,18,19,22,23,u,u,24,25,28,29,u,u,26,27]
1335 ; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255]
1336 ; AVX2-FAST-ALL-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
1337 ; AVX2-FAST-ALL-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1338 ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u]
1339 ; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,2,3,3]
1340 ; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7]
1341 ; AVX2-FAST-ALL-NEXT: vmovdqu %xmm0, 32(%rdi)
1342 ; AVX2-FAST-ALL-NEXT: vmovdqu %ymm3, (%rdi)
1343 ; AVX2-FAST-ALL-NEXT: vzeroupper
1344 ; AVX2-FAST-ALL-NEXT: retq
1346 ; AVX2-FAST-PERLANE-LABEL: interleave_24i16_in:
1347 ; AVX2-FAST-PERLANE: # %bb.0:
1348 ; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsi), %xmm0
1349 ; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rdx), %xmm1
1350 ; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rcx), %xmm2
1351 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3
1352 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[0,1,u,u,6,7,2,3,u,u,8,9,4,5,u,u,16,17,u,u,22,23,18,19,u,u,24,25,20,21,u,u]
1353 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1]
1354 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,0,1,u,u,u,u,2,3,u,u,u,u,4,5,u,u,22,23,u,u,u,u,24,25,u,u,u,u,26,27]
1355 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7],ymm4[8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14],ymm3[15]
1356 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = <u,0,0,u,1,1,u,2>
1357 ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm4, %ymm4
1358 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255]
1359 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
1360 ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1361 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u]
1362 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,2,3,3]
1363 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7]
1364 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %xmm0, 32(%rdi)
1365 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, (%rdi)
1366 ; AVX2-FAST-PERLANE-NEXT: vzeroupper
1367 ; AVX2-FAST-PERLANE-NEXT: retq
1369 ; XOP-LABEL: interleave_24i16_in:
1371 ; XOP-NEXT: vmovdqu (%rsi), %xmm0
1372 ; XOP-NEXT: vmovdqu (%rdx), %xmm1
1373 ; XOP-NEXT: vmovdqu (%rcx), %xmm2
1374 ; XOP-NEXT: vpperm {{.*#+}} xmm3 = xmm0[u,u,6,7],xmm1[6,7],xmm0[u,u,8,9],xmm1[8,9],xmm0[u,u,10,11]
1375 ; XOP-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,2,2]
1376 ; XOP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1,2],xmm4[3],xmm3[4,5],xmm4[6],xmm3[7]
1377 ; XOP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1378 ; XOP-NEXT: vpperm {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm2[0,1],xmm4[4,5,6,7],xmm2[2,3],xmm4[8,9,10,11]
1379 ; XOP-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
1380 ; XOP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1381 ; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[4,5],xmm2[10,11],xmm0[10,11,8,9],xmm2[12,13],xmm0[14,15,12,13],xmm2[14,15]
1382 ; XOP-NEXT: vmovdqu %xmm0, 32(%rdi)
1383 ; XOP-NEXT: vmovups %ymm3, (%rdi)
1384 ; XOP-NEXT: vzeroupper
1386 %s1 = load <8 x i16>, ptr %q1, align 4
1387 %s2 = load <8 x i16>, ptr %q2, align 4
1388 %s3 = load <8 x i16>, ptr %q3, align 4
1389 %t1 = shufflevector <8 x i16> %s1, <8 x i16> %s2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1390 %t2 = shufflevector <8 x i16> %s3, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1391 %interleaved = shufflevector <16 x i16> %t1, <16 x i16> %t2, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
1392 store <24 x i16> %interleaved, ptr %p, align 4
; De-interleave a <24 x i32> with stride 3.
; Loads a <24 x i32> from %p and splits it into three <8 x i32> vectors that
; hold elements {0,3,...,21}, {1,4,...,22} and {2,5,...,23}; these are stored to
; %q1, %q2 and %q3 respectively. Every load and store uses align 4.
; The per-prefix check lines are autogenerated by utils/update_llc_test_checks.py
; (see the NOTE at the top of the file).
1396 define void @interleave_24i32_out(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind {
1397 ; SSE2-LABEL: interleave_24i32_out:
1399 ; SSE2-NEXT: movdqu 64(%rdi), %xmm2
1400 ; SSE2-NEXT: movups 80(%rdi), %xmm4
1401 ; SSE2-NEXT: movdqu (%rdi), %xmm0
1402 ; SSE2-NEXT: movdqu 16(%rdi), %xmm3
1403 ; SSE2-NEXT: movups 32(%rdi), %xmm6
1404 ; SSE2-NEXT: movdqu 48(%rdi), %xmm1
1405 ; SSE2-NEXT: movaps %xmm6, %xmm7
1406 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,3,2,3]
1407 ; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm3[1,1,1,1]
1408 ; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1]
1409 ; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm6[0,3]
1410 ; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,0],xmm3[2,0]
1411 ; SSE2-NEXT: movdqa %xmm0, %xmm8
1412 ; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,3],xmm6[2,0]
1413 ; SSE2-NEXT: movaps %xmm4, %xmm6
1414 ; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm1[2,3,2,3]
1415 ; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm2[1,1,1,1]
1416 ; SSE2-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1]
1417 ; SSE2-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm4[0,3]
1418 ; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm2[2,0]
1419 ; SSE2-NEXT: movdqa %xmm1, %xmm10
1420 ; SSE2-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,3],xmm4[2,0]
1421 ; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,1],xmm2[3,3]
1422 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[0,0]
1423 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm6[2,0]
1424 ; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,1],xmm3[3,3]
1425 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm3[0,0]
1426 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm7[2,0]
1427 ; SSE2-NEXT: movups %xmm10, 16(%rsi)
1428 ; SSE2-NEXT: movups %xmm8, (%rsi)
1429 ; SSE2-NEXT: movups %xmm1, 16(%rdx)
1430 ; SSE2-NEXT: movups %xmm0, (%rdx)
1431 ; SSE2-NEXT: movups %xmm9, 16(%rcx)
1432 ; SSE2-NEXT: movups %xmm5, (%rcx)
1435 ; SSE42-LABEL: interleave_24i32_out:
1437 ; SSE42-NEXT: movups 80(%rdi), %xmm0
1438 ; SSE42-NEXT: movdqu 64(%rdi), %xmm1
1439 ; SSE42-NEXT: movdqu (%rdi), %xmm4
1440 ; SSE42-NEXT: movdqu 16(%rdi), %xmm2
1441 ; SSE42-NEXT: movups 32(%rdi), %xmm3
1442 ; SSE42-NEXT: movdqu 48(%rdi), %xmm5
1443 ; SSE42-NEXT: movdqa %xmm2, %xmm6
1444 ; SSE42-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1],xmm4[2,3],xmm6[4,5,6,7]
1445 ; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm4[2,3,2,3]
1446 ; SSE42-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm2[2,3]
1447 ; SSE42-NEXT: insertps {{.*#+}} xmm4 = xmm4[0,1,2],xmm3[1]
1448 ; SSE42-NEXT: movdqa %xmm1, %xmm8
1449 ; SSE42-NEXT: pblendw {{.*#+}} xmm8 = xmm8[0,1],xmm5[2,3],xmm8[4,5,6,7]
1450 ; SSE42-NEXT: pshufd {{.*#+}} xmm9 = xmm5[2,3,2,3]
1451 ; SSE42-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm1[2,3]
1452 ; SSE42-NEXT: insertps {{.*#+}} xmm5 = xmm5[0,1,2],xmm0[1]
1453 ; SSE42-NEXT: pshufd {{.*#+}} xmm10 = xmm3[2,2,2,2]
1454 ; SSE42-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,0,3,3]
1455 ; SSE42-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5],xmm10[6,7]
1456 ; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,0,3,3]
1457 ; SSE42-NEXT: pshufd {{.*#+}} xmm10 = xmm0[2,2,2,2]
1458 ; SSE42-NEXT: pblendw {{.*#+}} xmm10 = xmm8[0,1,2,3,4,5],xmm10[6,7]
1459 ; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1],xmm2[2,3],xmm7[4,5,6,7]
1460 ; SSE42-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm3[0,3]
1461 ; SSE42-NEXT: pblendw {{.*#+}} xmm9 = xmm9[0,1],xmm1[2,3],xmm9[4,5,6,7]
1462 ; SSE42-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm0[0,3]
1463 ; SSE42-NEXT: movups %xmm5, 16(%rsi)
1464 ; SSE42-NEXT: movups %xmm4, (%rsi)
1465 ; SSE42-NEXT: movdqu %xmm10, 16(%rdx)
1466 ; SSE42-NEXT: movdqu %xmm6, (%rdx)
1467 ; SSE42-NEXT: movups %xmm9, 16(%rcx)
1468 ; SSE42-NEXT: movups %xmm7, (%rcx)
1471 ; AVX1-LABEL: interleave_24i32_out:
1473 ; AVX1-NEXT: vmovups 64(%rdi), %ymm0
1474 ; AVX1-NEXT: vmovups 32(%rdi), %ymm1
1475 ; AVX1-NEXT: vmovups (%rdi), %ymm2
1476 ; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7]
1477 ; AVX1-NEXT: vmovups 16(%rdi), %xmm4
1478 ; AVX1-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,1],ymm1[1,3],ymm4[6,5],ymm1[5,7]
1479 ; AVX1-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,3],ymm4[0,2],ymm3[4,7],ymm4[4,6]
1480 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3,0,1]
1481 ; AVX1-NEXT: vshufps {{.*#+}} ymm5 = ymm0[1,0],ymm4[2,0],ymm0[5,4],ymm4[6,4]
1482 ; AVX1-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4]
1483 ; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7]
1484 ; AVX1-NEXT: vshufps {{.*#+}} ymm5 = ymm0[2,0],ymm4[3,0],ymm0[6,4],ymm4[7,4]
1485 ; AVX1-NEXT: vshufps {{.*#+}} ymm5 = ymm4[0,0],ymm5[2,0],ymm4[4,4],ymm5[6,4]
1486 ; AVX1-NEXT: vmovups 16(%rdi), %xmm6
1487 ; AVX1-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
1488 ; AVX1-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,2],ymm6[0,3],ymm7[5,6],ymm6[4,7]
1489 ; AVX1-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,2,3,1,4,6,7,5]
1490 ; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5,6,7]
1491 ; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
1492 ; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm6[1,0],ymm2[2,0],ymm6[5,4],ymm2[6,4]
1493 ; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm1[0,3],ymm2[6,4],ymm1[4,7]
1494 ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm4[0,1],ymm0[0,3],ymm4[4,5],ymm0[4,7]
1495 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
1496 ; AVX1-NEXT: vmovups %ymm3, (%rsi)
1497 ; AVX1-NEXT: vmovups %ymm5, (%rdx)
1498 ; AVX1-NEXT: vmovups %ymm0, (%rcx)
1499 ; AVX1-NEXT: vzeroupper
1502 ; AVX2-SLOW-LABEL: interleave_24i32_out:
1503 ; AVX2-SLOW: # %bb.0:
1504 ; AVX2-SLOW-NEXT: vmovups (%rdi), %ymm0
1505 ; AVX2-SLOW-NEXT: vmovups 32(%rdi), %ymm1
1506 ; AVX2-SLOW-NEXT: vmovups 64(%rdi), %ymm2
1507 ; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm3 = [2,5,2,5,2,5,2,5]
1508 ; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm3, %ymm3
1509 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
1510 ; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm5 = <0,3,6,1,4,7,u,u>
1511 ; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm5, %ymm4
1512 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
1513 ; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [0,0,3,6,0,0,3,6]
1514 ; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,0,1]
1515 ; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm4, %ymm4
1516 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
1517 ; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm6 = <1,4,7,2,5,u,u,u>
1518 ; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm6, %ymm5
1519 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7]
1520 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
1521 ; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm1 = <2,5,0,3,6,u,u,u>
1522 ; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm1, %ymm0
1523 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1,0,3,4,5,4,7]
1524 ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
1525 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
1526 ; AVX2-SLOW-NEXT: vmovups %ymm3, (%rsi)
1527 ; AVX2-SLOW-NEXT: vmovups %ymm4, (%rdx)
1528 ; AVX2-SLOW-NEXT: vmovups %ymm0, (%rcx)
1529 ; AVX2-SLOW-NEXT: vzeroupper
1530 ; AVX2-SLOW-NEXT: retq
1532 ; AVX2-FAST-ALL-LABEL: interleave_24i32_out:
1533 ; AVX2-FAST-ALL: # %bb.0:
1534 ; AVX2-FAST-ALL-NEXT: vmovups (%rdi), %ymm0
1535 ; AVX2-FAST-ALL-NEXT: vmovups 32(%rdi), %ymm1
1536 ; AVX2-FAST-ALL-NEXT: vmovups 64(%rdi), %ymm2
1537 ; AVX2-FAST-ALL-NEXT: vbroadcastsd {{.*#+}} ymm3 = [2,5,2,5,2,5,2,5]
1538 ; AVX2-FAST-ALL-NEXT: vpermps %ymm2, %ymm3, %ymm3
1539 ; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
1540 ; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm5 = <0,3,6,1,4,7,u,u>
1541 ; AVX2-FAST-ALL-NEXT: vpermps %ymm4, %ymm5, %ymm4
1542 ; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
1543 ; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [0,0,3,6,0,0,3,6]
1544 ; AVX2-FAST-ALL-NEXT: # ymm4 = mem[0,1,0,1]
1545 ; AVX2-FAST-ALL-NEXT: vpermps %ymm2, %ymm4, %ymm4
1546 ; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
1547 ; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm6 = <1,4,7,2,5,u,u,u>
1548 ; AVX2-FAST-ALL-NEXT: vpermps %ymm5, %ymm6, %ymm5
1549 ; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7]
1550 ; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm5 = [0,1,4,7,0,1,4,7]
1551 ; AVX2-FAST-ALL-NEXT: # ymm5 = mem[0,1,0,1]
1552 ; AVX2-FAST-ALL-NEXT: vpermps %ymm2, %ymm5, %ymm2
1553 ; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
1554 ; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = <2,5,0,3,6,u,u,u>
1555 ; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0
1556 ; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
1557 ; AVX2-FAST-ALL-NEXT: vmovups %ymm3, (%rsi)
1558 ; AVX2-FAST-ALL-NEXT: vmovups %ymm4, (%rdx)
1559 ; AVX2-FAST-ALL-NEXT: vmovups %ymm0, (%rcx)
1560 ; AVX2-FAST-ALL-NEXT: vzeroupper
1561 ; AVX2-FAST-ALL-NEXT: retq
1563 ; AVX2-FAST-PERLANE-LABEL: interleave_24i32_out:
1564 ; AVX2-FAST-PERLANE: # %bb.0:
1565 ; AVX2-FAST-PERLANE-NEXT: vmovups (%rdi), %ymm0
1566 ; AVX2-FAST-PERLANE-NEXT: vmovups 32(%rdi), %ymm1
1567 ; AVX2-FAST-PERLANE-NEXT: vmovups 64(%rdi), %ymm2
1568 ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm3 = [2,5,2,5,2,5,2,5]
1569 ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm3, %ymm3
1570 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
1571 ; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm5 = <0,3,6,1,4,7,u,u>
1572 ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm5, %ymm4
1573 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
1574 ; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [0,0,3,6,0,0,3,6]
1575 ; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,0,1]
1576 ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm4, %ymm4
1577 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
1578 ; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm6 = <1,4,7,2,5,u,u,u>
1579 ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm6, %ymm5
1580 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7]
1581 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
1582 ; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm1 = <2,5,0,3,6,u,u,u>
1583 ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm1, %ymm0
1584 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1,0,3,4,5,4,7]
1585 ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
1586 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
1587 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, (%rsi)
1588 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, (%rdx)
1589 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, (%rcx)
1590 ; AVX2-FAST-PERLANE-NEXT: vzeroupper
1591 ; AVX2-FAST-PERLANE-NEXT: retq
1593 ; XOP-LABEL: interleave_24i32_out:
1595 ; XOP-NEXT: vmovups 64(%rdi), %ymm0
1596 ; XOP-NEXT: vmovups 32(%rdi), %ymm1
1597 ; XOP-NEXT: vmovups (%rdi), %ymm2
1598 ; XOP-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7]
1599 ; XOP-NEXT: vmovups 16(%rdi), %xmm4
1600 ; XOP-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,1],ymm1[1,3],ymm4[6,5],ymm1[5,7]
1601 ; XOP-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,3],ymm4[0,2],ymm3[4,7],ymm4[4,6]
1602 ; XOP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3,0,1]
1603 ; XOP-NEXT: vshufps {{.*#+}} ymm5 = ymm0[1,0],ymm4[2,0],ymm0[5,4],ymm4[6,4]
1604 ; XOP-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4]
1605 ; XOP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7]
1606 ; XOP-NEXT: vshufps {{.*#+}} ymm5 = ymm0[2,0],ymm4[3,0],ymm0[6,4],ymm4[7,4]
1607 ; XOP-NEXT: vshufps {{.*#+}} ymm5 = ymm4[0,0],ymm5[2,0],ymm4[4,4],ymm5[6,4]
1608 ; XOP-NEXT: vmovups 16(%rdi), %xmm6
1609 ; XOP-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
1610 ; XOP-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,2],ymm6[0,3],ymm7[5,6],ymm6[4,7]
1611 ; XOP-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,2,3,1,4,6,7,5]
1612 ; XOP-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5,6,7]
1613 ; XOP-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
1614 ; XOP-NEXT: vshufps {{.*#+}} ymm2 = ymm6[1,0],ymm2[2,0],ymm6[5,4],ymm2[6,4]
1615 ; XOP-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm1[0,3],ymm2[6,4],ymm1[4,7]
1616 ; XOP-NEXT: vshufps {{.*#+}} ymm0 = ymm4[0,1],ymm0[0,3],ymm4[4,5],ymm0[4,7]
1617 ; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
1618 ; XOP-NEXT: vmovups %ymm3, (%rsi)
1619 ; XOP-NEXT: vmovups %ymm5, (%rdx)
1620 ; XOP-NEXT: vmovups %ymm0, (%rcx)
1621 ; XOP-NEXT: vzeroupper
; Single wide load of all 24 elements; only 4-byte alignment is promised.
1623 %wide.vec = load <24 x i32>, ptr %p, align 4
; Lane k of %s1 / %s2 / %s3 is element 3*k+0 / 3*k+1 / 3*k+2 of %wide.vec.
; The second shuffle operand is undef and is never selected by these masks.
1624 %s1 = shufflevector <24 x i32> %wide.vec, <24 x i32> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
1625 %s2 = shufflevector <24 x i32> %wide.vec, <24 x i32> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
1626 %s3 = shufflevector <24 x i32> %wide.vec, <24 x i32> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
; Each de-interleaved vector goes to its own destination pointer.
1627 store <8 x i32> %s1, ptr %q1, align 4
1628 store <8 x i32> %s2, ptr %q2, align 4
1629 store <8 x i32> %s3, ptr %q3, align 4
; Interleave three <8 x i32> vectors with stride 3.
; Loads one <8 x i32> from each of %q1, %q2 and %q3 and stores to %p a
; <24 x i32> laid out as s1[0], s2[0], s3[0], s1[1], s2[1], s3[1], ...,
; s1[7], s2[7], s3[7]. Every load and store uses align 4.
; The per-prefix check lines are autogenerated by utils/update_llc_test_checks.py
; (see the NOTE at the top of the file).
1633 define void @interleave_24i32_in(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind {
1634 ; SSE2-LABEL: interleave_24i32_in:
1636 ; SSE2-NEXT: movups (%rsi), %xmm1
1637 ; SSE2-NEXT: movups 16(%rsi), %xmm0
1638 ; SSE2-NEXT: movups (%rdx), %xmm3
1639 ; SSE2-NEXT: movups 16(%rdx), %xmm5
1640 ; SSE2-NEXT: movups (%rcx), %xmm4
1641 ; SSE2-NEXT: movups 16(%rcx), %xmm7
1642 ; SSE2-NEXT: movaps %xmm4, %xmm6
1643 ; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[1,3]
1644 ; SSE2-NEXT: movaps %xmm1, %xmm2
1645 ; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
1646 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm6[0,2]
1647 ; SSE2-NEXT: movaps %xmm0, %xmm8
1648 ; SSE2-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm5[1]
1649 ; SSE2-NEXT: movaps %xmm7, %xmm9
1650 ; SSE2-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm0[1,3]
1651 ; SSE2-NEXT: movaps %xmm0, %xmm6
1652 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm5[3,3]
1653 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm7[2,3]
1654 ; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,1],xmm5[1,1]
1655 ; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm8[0,2]
1656 ; SSE2-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
1657 ; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm9[0,2]
1658 ; SSE2-NEXT: movaps %xmm1, %xmm5
1659 ; SSE2-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm3[1]
1660 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm3[3,3]
1661 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm4[2,3]
1662 ; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1]
1663 ; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm5[0,2]
1664 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3]
1665 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3]
1666 ; SSE2-NEXT: movups %xmm4, 16(%rdi)
1667 ; SSE2-NEXT: movups %xmm6, 48(%rdi)
1668 ; SSE2-NEXT: movups %xmm7, 64(%rdi)
1669 ; SSE2-NEXT: movups %xmm2, (%rdi)
1670 ; SSE2-NEXT: movups %xmm1, 32(%rdi)
1671 ; SSE2-NEXT: movups %xmm0, 80(%rdi)
1674 ; SSE42-LABEL: interleave_24i32_in:
1676 ; SSE42-NEXT: movdqu (%rsi), %xmm0
1677 ; SSE42-NEXT: movdqu 16(%rsi), %xmm2
1678 ; SSE42-NEXT: movdqu (%rdx), %xmm3
1679 ; SSE42-NEXT: movdqu 16(%rdx), %xmm4
1680 ; SSE42-NEXT: movdqu (%rcx), %xmm5
1681 ; SSE42-NEXT: movdqu 16(%rcx), %xmm6
1682 ; SSE42-NEXT: movdqa %xmm0, %xmm1
1683 ; SSE42-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
1684 ; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,1,2,2]
1685 ; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,1,0,1]
1686 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm7[0,1,2,3],xmm1[4,5],xmm7[6,7]
1687 ; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,1,2,2]
1688 ; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm2[4,5],xmm7[6,7]
1689 ; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1],xmm6[2,3],xmm7[4,5,6,7]
1690 ; SSE42-NEXT: movdqa %xmm2, %xmm8
1691 ; SSE42-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1]
1692 ; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,2,2]
1693 ; SSE42-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,1,0,1]
1694 ; SSE42-NEXT: pblendw {{.*#+}} xmm9 = xmm8[0,1,2,3],xmm9[4,5],xmm8[6,7]
1695 ; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm3[1,1,2,2]
1696 ; SSE42-NEXT: pblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm0[4,5],xmm8[6,7]
1697 ; SSE42-NEXT: pblendw {{.*#+}} xmm8 = xmm8[0,1],xmm5[2,3],xmm8[4,5,6,7]
1698 ; SSE42-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm4[3,3]
1699 ; SSE42-NEXT: pshufd {{.*#+}} xmm4 = xmm6[2,3,2,3]
1700 ; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3,4,5],xmm4[6,7]
1701 ; SSE42-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm3[3,3]
1702 ; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm5[2,3,2,3]
1703 ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3,4,5],xmm2[6,7]
1704 ; SSE42-NEXT: movdqu %xmm2, 32(%rdi)
1705 ; SSE42-NEXT: movdqu %xmm4, 80(%rdi)
1706 ; SSE42-NEXT: movdqu %xmm8, 16(%rdi)
1707 ; SSE42-NEXT: movdqu %xmm9, 48(%rdi)
1708 ; SSE42-NEXT: movdqu %xmm7, 64(%rdi)
1709 ; SSE42-NEXT: movdqu %xmm1, (%rdi)
1712 ; AVX1-LABEL: interleave_24i32_in:
1714 ; AVX1-NEXT: vmovupd (%rcx), %ymm0
1715 ; AVX1-NEXT: vmovups (%rdx), %xmm1
1716 ; AVX1-NEXT: vmovups 16(%rdx), %xmm2
1717 ; AVX1-NEXT: vmovups (%rsi), %xmm3
1718 ; AVX1-NEXT: vmovups 16(%rsi), %xmm4
1719 ; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm4[3,3],xmm2[3,3]
1720 ; AVX1-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1]
1721 ; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,1],xmm4[0,2]
1722 ; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2
1723 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3,2,3]
1724 ; AVX1-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0,0,3,3]
1725 ; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6],ymm4[7]
1726 ; AVX1-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm3[1],xmm1[1]
1727 ; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm1[1,1],xmm4[0,2]
1728 ; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0]
1729 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[2,1]
1730 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
1731 ; AVX1-NEXT: vbroadcastsd (%rcx), %ymm3
1732 ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7]
1733 ; AVX1-NEXT: vpermilps {{.*#+}} ymm3 = mem[0,0,3,3,4,4,7,7]
1734 ; AVX1-NEXT: vpermilpd {{.*#+}} ymm4 = mem[1,0,2,2]
1735 ; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
1736 ; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,1,2,2]
1737 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1,2],ymm0[3],ymm3[4,5],ymm0[6],ymm3[7]
1738 ; AVX1-NEXT: vmovups %ymm0, 32(%rdi)
1739 ; AVX1-NEXT: vmovups %ymm1, (%rdi)
1740 ; AVX1-NEXT: vmovups %ymm2, 64(%rdi)
1741 ; AVX1-NEXT: vzeroupper
1744 ; AVX2-SLOW-LABEL: interleave_24i32_in:
1745 ; AVX2-SLOW: # %bb.0:
1746 ; AVX2-SLOW-NEXT: vmovups (%rsi), %ymm0
1747 ; AVX2-SLOW-NEXT: vmovups (%rdx), %ymm1
1748 ; AVX2-SLOW-NEXT: vmovups (%rcx), %ymm2
1749 ; AVX2-SLOW-NEXT: vbroadcastsd 24(%rsi), %ymm3
1750 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm1[1,2,3,3,5,6,7,7]
1751 ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,2,2,3]
1752 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
1753 ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm2[2,1,3,3]
1754 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7]
1755 ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm4 = mem[1,0,2,2]
1756 ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,1]
1757 ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm0[0,0,2,1]
1758 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6],ymm4[7]
1759 ; AVX2-SLOW-NEXT: vbroadcastsd (%rcx), %ymm5
1760 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7]
1761 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7]
1762 ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2]
1763 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
1764 ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm2[1,1,2,2]
1765 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7]
1766 ; AVX2-SLOW-NEXT: vmovups %ymm0, 32(%rdi)
1767 ; AVX2-SLOW-NEXT: vmovups %ymm4, (%rdi)
1768 ; AVX2-SLOW-NEXT: vmovups %ymm3, 64(%rdi)
1769 ; AVX2-SLOW-NEXT: vzeroupper
1770 ; AVX2-SLOW-NEXT: retq
1772 ; AVX2-FAST-ALL-LABEL: interleave_24i32_in:
1773 ; AVX2-FAST-ALL: # %bb.0:
1774 ; AVX2-FAST-ALL-NEXT: vmovups (%rsi), %ymm0
1775 ; AVX2-FAST-ALL-NEXT: vmovups (%rdx), %ymm1
1776 ; AVX2-FAST-ALL-NEXT: vmovups (%rcx), %ymm2
1777 ; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [5,0,7,6,5,0,7,6]
1778 ; AVX2-FAST-ALL-NEXT: # ymm3 = mem[0,1,0,1]
1779 ; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm3, %ymm3
1780 ; AVX2-FAST-ALL-NEXT: vbroadcastsd 24(%rsi), %ymm4
1781 ; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
1782 ; AVX2-FAST-ALL-NEXT: vpermpd {{.*#+}} ymm4 = ymm2[2,1,3,3]
1783 ; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7]
1784 ; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [1,0,2,2,1,0,2,2]
1785 ; AVX2-FAST-ALL-NEXT: # ymm4 = mem[0,1,0,1]
1786 ; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm4, %ymm4
1787 ; AVX2-FAST-ALL-NEXT: vpermpd {{.*#+}} ymm5 = ymm0[0,0,2,1]
1788 ; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6],ymm4[7]
1789 ; AVX2-FAST-ALL-NEXT: vbroadcastsd (%rcx), %ymm5
1790 ; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7]
1791 ; AVX2-FAST-ALL-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7]
1792 ; AVX2-FAST-ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2]
1793 ; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
1794 ; AVX2-FAST-ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm2[1,1,2,2]
1795 ; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7]
1796 ; AVX2-FAST-ALL-NEXT: vmovups %ymm0, 32(%rdi)
1797 ; AVX2-FAST-ALL-NEXT: vmovups %ymm4, (%rdi)
1798 ; AVX2-FAST-ALL-NEXT: vmovups %ymm3, 64(%rdi)
1799 ; AVX2-FAST-ALL-NEXT: vzeroupper
1800 ; AVX2-FAST-ALL-NEXT: retq
1802 ; AVX2-FAST-PERLANE-LABEL: interleave_24i32_in:
1803 ; AVX2-FAST-PERLANE: # %bb.0:
1804 ; AVX2-FAST-PERLANE-NEXT: vmovups (%rsi), %ymm0
1805 ; AVX2-FAST-PERLANE-NEXT: vmovups (%rdx), %ymm1
1806 ; AVX2-FAST-PERLANE-NEXT: vmovups (%rcx), %ymm2
1807 ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 24(%rsi), %ymm3
1808 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm1[1,2,3,3,5,6,7,7]
1809 ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,2,2,3]
1810 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
1811 ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm2[2,1,3,3]
1812 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7]
1813 ; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm4 = mem[1,0,2,2]
1814 ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,1]
1815 ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm0[0,0,2,1]
1816 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6],ymm4[7]
1817 ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd (%rcx), %ymm5
1818 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7]
1819 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7]
1820 ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2]
1821 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
1822 ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm2[1,1,2,2]
1823 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7]
1824 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, 32(%rdi)
1825 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, (%rdi)
1826 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, 64(%rdi)
1827 ; AVX2-FAST-PERLANE-NEXT: vzeroupper
1828 ; AVX2-FAST-PERLANE-NEXT: retq
1830 ; XOP-LABEL: interleave_24i32_in:
1832 ; XOP-NEXT: vmovups (%rsi), %ymm0
1833 ; XOP-NEXT: vmovups (%rdx), %ymm1
1834 ; XOP-NEXT: vpermil2ps {{.*#+}} ymm0 = ymm0[u,3],ymm1[3],ymm0[u,4],ymm1[4],ymm0[u,5]
1835 ; XOP-NEXT: vmovups (%rcx), %ymm1
1836 ; XOP-NEXT: vmovups (%rdx), %xmm2
1837 ; XOP-NEXT: vmovups 16(%rdx), %xmm3
1838 ; XOP-NEXT: vmovups (%rsi), %xmm4
1839 ; XOP-NEXT: vmovups 16(%rsi), %xmm5
1840 ; XOP-NEXT: vshufps {{.*#+}} xmm6 = xmm5[3,3],xmm3[3,3]
1841 ; XOP-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm3[1]
1842 ; XOP-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,1],xmm5[0,2]
1843 ; XOP-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3
1844 ; XOP-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm1[2,3,2,3]
1845 ; XOP-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0,0,3,3]
1846 ; XOP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5,6],ymm5[7]
1847 ; XOP-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm4[1],xmm2[1]
1848 ; XOP-NEXT: vshufps {{.*#+}} xmm5 = xmm2[1,1],xmm5[0,2]
1849 ; XOP-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm4[0]
1850 ; XOP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,0],xmm4[2,1]
1851 ; XOP-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2
1852 ; XOP-NEXT: vbroadcastsd (%rcx), %ymm4
1853 ; XOP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7]
1854 ; XOP-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1,1,2,2]
1855 ; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7]
1856 ; XOP-NEXT: vmovups %ymm0, 32(%rdi)
1857 ; XOP-NEXT: vmovups %ymm2, (%rdi)
1858 ; XOP-NEXT: vmovups %ymm3, 64(%rdi)
1859 ; XOP-NEXT: vzeroupper
; Three separate 8-element source loads; only 4-byte alignment is promised.
1861 %s1 = load <8 x i32>, ptr %q1, align 4
1862 %s2 = load <8 x i32>, ptr %q2, align 4
1863 %s3 = load <8 x i32>, ptr %q3, align 4
; %t1 is the 16-lane concatenation of %s1 (lanes 0-7) and %s2 (lanes 8-15).
1864 %t1 = shufflevector <8 x i32> %s1, <8 x i32> %s2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; %t2 widens %s3 to 16 lanes so it matches the type of %t1; lanes 8-15 are undef.
1865 %t2 = shufflevector <8 x i32> %s3, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; Mask index i picks %s1[i], 8+i picks %s2[i] and 16+i picks %s3[i] (lane i of
; %t2), for i = 0..7; the undef lanes of %t2 are never selected.
1866 %interleaved = shufflevector <16 x i32> %t1, <16 x i32> %t2, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
1867 store <24 x i32> %interleaved, ptr %p, align 4
1871 ; Repeat each element of the concatenation of <16 x i8> a0 and a1 three times to create a <96 x i8>.
1872 define void @splat3_128(<16 x i8> %a0, <16 x i8> %a1, ptr%a2) {
1873 ; SSE2-LABEL: splat3_128:
1875 ; SSE2-NEXT: pxor %xmm4, %xmm4
1876 ; SSE2-NEXT: movdqa %xmm0, %xmm3
1877 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
1878 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[2,3,3,3,4,5,6,7]
1879 ; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,4,4,4,5]
1880 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,0,1]
1881 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,1,4,5,6,7]
1882 ; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,6,6]
1883 ; SSE2-NEXT: packuswb %xmm5, %xmm2
1884 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15]
1885 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,1,0,1]
1886 ; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,0,0,1,4,5,6,7]
1887 ; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,6,6]
1888 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
1889 ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,1,2,2,4,5,6,7]
1890 ; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,7,7,7]
1891 ; SSE2-NEXT: packuswb %xmm5, %xmm3
1892 ; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm0[2,3,3,3,4,5,6,7]
1893 ; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,5]
1894 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
1895 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,2,2,4,5,6,7]
1896 ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,7,7]
1897 ; SSE2-NEXT: packuswb %xmm0, %xmm5
1898 ; SSE2-NEXT: movdqa %xmm1, %xmm0
1899 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
1900 ; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[2,3,3,3,4,5,6,7]
1901 ; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,5]
1902 ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,1,0,1]
1903 ; SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,0,0,1,4,5,6,7]
1904 ; SSE2-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,6,6]
1905 ; SSE2-NEXT: packuswb %xmm6, %xmm7
1906 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15]
1907 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,1,0,1]
1908 ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,1,4,5,6,7]
1909 ; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,6,6]
1910 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
1911 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,2,2,4,5,6,7]
1912 ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,7,7]
1913 ; SSE2-NEXT: packuswb %xmm4, %xmm0
1914 ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[2,3,3,3,4,5,6,7]
1915 ; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,5]
1916 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
1917 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,1,2,2,4,5,6,7]
1918 ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,7,7,7]
1919 ; SSE2-NEXT: packuswb %xmm1, %xmm4
1920 ; SSE2-NEXT: movdqa %xmm4, 80(%rdi)
1921 ; SSE2-NEXT: movdqa %xmm0, 64(%rdi)
1922 ; SSE2-NEXT: movdqa %xmm7, 48(%rdi)
1923 ; SSE2-NEXT: movdqa %xmm5, 32(%rdi)
1924 ; SSE2-NEXT: movdqa %xmm3, 16(%rdi)
1925 ; SSE2-NEXT: movdqa %xmm2, (%rdi)
1928 ; SSE42-LABEL: splat3_128:
1930 ; SSE42-NEXT: movdqa {{.*#+}} xmm2 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5]
1931 ; SSE42-NEXT: movdqa %xmm0, %xmm3
1932 ; SSE42-NEXT: pshufb %xmm2, %xmm3
1933 ; SSE42-NEXT: movdqa {{.*#+}} xmm4 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10]
1934 ; SSE42-NEXT: movdqa %xmm0, %xmm5
1935 ; SSE42-NEXT: pshufb %xmm4, %xmm5
1936 ; SSE42-NEXT: movdqa {{.*#+}} xmm6 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15]
1937 ; SSE42-NEXT: pshufb %xmm6, %xmm0
1938 ; SSE42-NEXT: movdqa %xmm1, %xmm7
1939 ; SSE42-NEXT: pshufb %xmm2, %xmm7
1940 ; SSE42-NEXT: movdqa %xmm1, %xmm2
1941 ; SSE42-NEXT: pshufb %xmm4, %xmm2
1942 ; SSE42-NEXT: pshufb %xmm6, %xmm1
1943 ; SSE42-NEXT: movdqa %xmm1, 80(%rdi)
1944 ; SSE42-NEXT: movdqa %xmm2, 64(%rdi)
1945 ; SSE42-NEXT: movdqa %xmm7, 48(%rdi)
1946 ; SSE42-NEXT: movdqa %xmm0, 32(%rdi)
1947 ; SSE42-NEXT: movdqa %xmm5, 16(%rdi)
1948 ; SSE42-NEXT: movdqa %xmm3, (%rdi)
1951 ; AVX1-LABEL: splat3_128:
1953 ; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm1[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
1954 ; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
1955 ; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
1956 ; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm0[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
1957 ; AVX1-NEXT: vpalignr {{.*#+}} xmm6 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
1958 ; AVX1-NEXT: vpalignr {{.*#+}} xmm7 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
1959 ; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
1960 ; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
1961 ; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4]
1962 ; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
1963 ; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm7[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
1964 ; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm6[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
1965 ; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
1966 ; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4]
1967 ; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4]
1968 ; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4]
1969 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
1970 ; AVX1-NEXT: vpshufb %xmm6, %xmm3, %xmm3
1971 ; AVX1-NEXT: vpshufb %xmm6, %xmm0, %xmm0
1972 ; AVX1-NEXT: vpshufb %xmm6, %xmm5, %xmm5
1973 ; AVX1-NEXT: vpshufb %xmm6, %xmm1, %xmm1
1974 ; AVX1-NEXT: vpshufb %xmm6, %xmm2, %xmm2
1975 ; AVX1-NEXT: vpshufb %xmm6, %xmm4, %xmm4
1976 ; AVX1-NEXT: vmovdqa %xmm4, 80(%rdi)
1977 ; AVX1-NEXT: vmovdqa %xmm2, 64(%rdi)
1978 ; AVX1-NEXT: vmovdqa %xmm1, 48(%rdi)
1979 ; AVX1-NEXT: vmovdqa %xmm5, 32(%rdi)
1980 ; AVX1-NEXT: vmovdqa %xmm3, 16(%rdi)
1981 ; AVX1-NEXT: vmovdqa %xmm0, (%rdi)
1984 ; AVX2-LABEL: splat3_128:
1986 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1987 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
1988 ; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21]
1989 ; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm0[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26]
1990 ; AVX2-NEXT: vpalignr {{.*#+}} ymm3 = ymm1[5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4],ymm1[21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20]
1991 ; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20]
1992 ; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20]
1993 ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm3[5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4],ymm3[21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20]
1994 ; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm1[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20]
1995 ; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20]
1996 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3
1997 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
1998 ; AVX2-NEXT: # ymm4 = mem[0,1,0,1]
1999 ; AVX2-NEXT: vpshufb %ymm4, %ymm3, %ymm3
2000 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
2001 ; AVX2-NEXT: vpshufb %ymm4, %ymm0, %ymm0
2002 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3]
2003 ; AVX2-NEXT: vpshufb %ymm4, %ymm1, %ymm1
2004 ; AVX2-NEXT: vmovdqa %ymm1, 64(%rdi)
2005 ; AVX2-NEXT: vmovdqa %ymm0, 32(%rdi)
2006 ; AVX2-NEXT: vmovdqa %ymm3, (%rdi)
2007 ; AVX2-NEXT: vzeroupper
2010 ; XOP-LABEL: splat3_128:
2012 ; XOP-NEXT: vpalignr {{.*#+}} xmm2 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
2013 ; XOP-NEXT: vpalignr {{.*#+}} xmm3 = xmm1[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
2014 ; XOP-NEXT: vpalignr {{.*#+}} xmm4 = xmm0[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
2015 ; XOP-NEXT: vpalignr {{.*#+}} xmm5 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
2016 ; XOP-NEXT: vpalignr {{.*#+}} xmm6 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
2017 ; XOP-NEXT: vpalignr {{.*#+}} xmm7 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
2018 ; XOP-NEXT: vpalignr {{.*#+}} xmm3 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
2019 ; XOP-NEXT: vpalignr {{.*#+}} xmm2 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
2020 ; XOP-NEXT: vpalignr {{.*#+}} xmm5 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4]
2021 ; XOP-NEXT: vpalignr {{.*#+}} xmm4 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
2022 ; XOP-NEXT: vmovdqa {{.*#+}} xmm8 = [5,16,11,6,17,12,7,18,13,8,19,14,9,20,15,10]
2023 ; XOP-NEXT: vpperm %xmm8, %xmm4, %xmm2, %xmm2
2024 ; XOP-NEXT: vpperm %xmm8, %xmm0, %xmm7, %xmm0
2025 ; XOP-NEXT: vpperm %xmm8, %xmm7, %xmm4, %xmm4
2026 ; XOP-NEXT: vpperm %xmm8, %xmm1, %xmm6, %xmm1
2027 ; XOP-NEXT: vpperm %xmm8, %xmm5, %xmm3, %xmm3
2028 ; XOP-NEXT: vpperm %xmm8, %xmm6, %xmm5, %xmm5
2029 ; XOP-NEXT: vmovdqa %xmm5, 80(%rdi)
2030 ; XOP-NEXT: vmovdqa %xmm3, 64(%rdi)
2031 ; XOP-NEXT: vmovdqa %xmm1, 48(%rdi)
2032 ; XOP-NEXT: vmovdqa %xmm4, 32(%rdi)
2033 ; XOP-NEXT: vmovdqa %xmm2, 16(%rdi)
2034 ; XOP-NEXT: vmovdqa %xmm0, (%rdi)
2036 %1 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
2037 %2 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2038 %3 = shufflevector <64 x i8> %1, <64 x i8> %2, <96 x i32> <i32 0, i32 32, i32 64, i32 1, i32 33, i32 65, i32 2, i32 34, i32 66, i32 3, i32 35, i32 67, i32 4, i32 36, i32 68, i32 5, i32 37, i32 69, i32 6, i32 38, i32 70, i32 7, i32 39, i32 71, i32 8, i32 40, i32 72, i32 9, i32 41, i32 73, i32 10, i32 42, i32 74, i32 11, i32 43, i32 75, i32 12, i32 44, i32 76, i32 13, i32 45, i32 77, i32 14, i32 46, i32 78, i32 15, i32 47, i32 79, i32 16, i32 48, i32 80, i32 17, i32 49, i32 81, i32 18, i32 50, i32 82, i32 19, i32 51, i32 83, i32 20, i32 52, i32 84, i32 21, i32 53, i32 85, i32 22, i32 54, i32 86, i32 23, i32 55, i32 87, i32 24, i32 56, i32 88, i32 25, i32 57, i32 89, i32 26, i32 58, i32 90, i32 27, i32 59, i32 91, i32 28, i32 60, i32 92, i32 29, i32 61, i32 93, i32 30, i32 62, i32 94, i32 31, i32 63, i32 95>
2039 store <96 x i8> %3, ptr %a2
2043 ; Repeat each element x 3 of <32 x i8> a0 to create a <96 x i8>.
2044 define void @splat3_256(<32 x i8> %a0, ptr%a1) {
2045 ; SSE2-LABEL: splat3_256:
2047 ; SSE2-NEXT: pxor %xmm4, %xmm4
2048 ; SSE2-NEXT: movdqa %xmm0, %xmm3
2049 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
2050 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[2,3,3,3,4,5,6,7]
2051 ; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,4,4,4,5]
2052 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,0,1]
2053 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,1,4,5,6,7]
2054 ; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,6,6]
2055 ; SSE2-NEXT: packuswb %xmm5, %xmm2
2056 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15]
2057 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,1,0,1]
2058 ; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,0,0,1,4,5,6,7]
2059 ; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,6,6]
2060 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
2061 ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,1,2,2,4,5,6,7]
2062 ; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,7,7,7]
2063 ; SSE2-NEXT: packuswb %xmm5, %xmm3
2064 ; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm0[2,3,3,3,4,5,6,7]
2065 ; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,5]
2066 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
2067 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,2,2,4,5,6,7]
2068 ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,7,7]
2069 ; SSE2-NEXT: packuswb %xmm0, %xmm5
2070 ; SSE2-NEXT: movdqa %xmm1, %xmm0
2071 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
2072 ; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[2,3,3,3,4,5,6,7]
2073 ; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,5]
2074 ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,1,0,1]
2075 ; SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,0,0,1,4,5,6,7]
2076 ; SSE2-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,6,6]
2077 ; SSE2-NEXT: packuswb %xmm6, %xmm7
2078 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15]
2079 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,1,0,1]
2080 ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,1,4,5,6,7]
2081 ; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,6,6]
2082 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
2083 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,2,2,4,5,6,7]
2084 ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,7,7]
2085 ; SSE2-NEXT: packuswb %xmm4, %xmm0
2086 ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[2,3,3,3,4,5,6,7]
2087 ; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,5]
2088 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
2089 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,1,2,2,4,5,6,7]
2090 ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,7,7,7]
2091 ; SSE2-NEXT: packuswb %xmm1, %xmm4
2092 ; SSE2-NEXT: movdqa %xmm4, 80(%rdi)
2093 ; SSE2-NEXT: movdqa %xmm0, 64(%rdi)
2094 ; SSE2-NEXT: movdqa %xmm7, 48(%rdi)
2095 ; SSE2-NEXT: movdqa %xmm5, 32(%rdi)
2096 ; SSE2-NEXT: movdqa %xmm3, 16(%rdi)
2097 ; SSE2-NEXT: movdqa %xmm2, (%rdi)
2100 ; SSE42-LABEL: splat3_256:
2102 ; SSE42-NEXT: movdqa {{.*#+}} xmm2 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5]
2103 ; SSE42-NEXT: movdqa %xmm0, %xmm3
2104 ; SSE42-NEXT: pshufb %xmm2, %xmm3
2105 ; SSE42-NEXT: movdqa {{.*#+}} xmm4 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10]
2106 ; SSE42-NEXT: movdqa %xmm0, %xmm5
2107 ; SSE42-NEXT: pshufb %xmm4, %xmm5
2108 ; SSE42-NEXT: movdqa {{.*#+}} xmm6 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15]
2109 ; SSE42-NEXT: pshufb %xmm6, %xmm0
2110 ; SSE42-NEXT: movdqa %xmm1, %xmm7
2111 ; SSE42-NEXT: pshufb %xmm2, %xmm7
2112 ; SSE42-NEXT: movdqa %xmm1, %xmm2
2113 ; SSE42-NEXT: pshufb %xmm4, %xmm2
2114 ; SSE42-NEXT: pshufb %xmm6, %xmm1
2115 ; SSE42-NEXT: movdqa %xmm1, 80(%rdi)
2116 ; SSE42-NEXT: movdqa %xmm2, 64(%rdi)
2117 ; SSE42-NEXT: movdqa %xmm7, 48(%rdi)
2118 ; SSE42-NEXT: movdqa %xmm0, 32(%rdi)
2119 ; SSE42-NEXT: movdqa %xmm5, 16(%rdi)
2120 ; SSE42-NEXT: movdqa %xmm3, (%rdi)
2123 ; AVX1-LABEL: splat3_256:
2125 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
2126 ; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm1[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
2127 ; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
2128 ; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
2129 ; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm0[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
2130 ; AVX1-NEXT: vpalignr {{.*#+}} xmm6 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
2131 ; AVX1-NEXT: vpalignr {{.*#+}} xmm7 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
2132 ; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
2133 ; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
2134 ; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4]
2135 ; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
2136 ; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm7[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
2137 ; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm6[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
2138 ; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
2139 ; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4]
2140 ; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4]
2141 ; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4]
2142 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
2143 ; AVX1-NEXT: vpshufb %xmm6, %xmm3, %xmm3
2144 ; AVX1-NEXT: vpshufb %xmm6, %xmm0, %xmm0
2145 ; AVX1-NEXT: vpshufb %xmm6, %xmm5, %xmm5
2146 ; AVX1-NEXT: vpshufb %xmm6, %xmm1, %xmm1
2147 ; AVX1-NEXT: vpshufb %xmm6, %xmm2, %xmm2
2148 ; AVX1-NEXT: vpshufb %xmm6, %xmm4, %xmm4
2149 ; AVX1-NEXT: vmovdqa %xmm4, 80(%rdi)
2150 ; AVX1-NEXT: vmovdqa %xmm2, 64(%rdi)
2151 ; AVX1-NEXT: vmovdqa %xmm1, 48(%rdi)
2152 ; AVX1-NEXT: vmovdqa %xmm5, 32(%rdi)
2153 ; AVX1-NEXT: vmovdqa %xmm3, 16(%rdi)
2154 ; AVX1-NEXT: vmovdqa %xmm0, (%rdi)
2155 ; AVX1-NEXT: vzeroupper
2158 ; AVX2-LABEL: splat3_256:
2160 ; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21]
2161 ; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm0[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26]
2162 ; AVX2-NEXT: vpalignr {{.*#+}} ymm3 = ymm1[5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4],ymm1[21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20]
2163 ; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20]
2164 ; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20]
2165 ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm3[5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4],ymm3[21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20]
2166 ; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm1[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20]
2167 ; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20]
2168 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3
2169 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
2170 ; AVX2-NEXT: # ymm4 = mem[0,1,0,1]
2171 ; AVX2-NEXT: vpshufb %ymm4, %ymm3, %ymm3
2172 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
2173 ; AVX2-NEXT: vpshufb %ymm4, %ymm0, %ymm0
2174 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3]
2175 ; AVX2-NEXT: vpshufb %ymm4, %ymm1, %ymm1
2176 ; AVX2-NEXT: vmovdqa %ymm1, 64(%rdi)
2177 ; AVX2-NEXT: vmovdqa %ymm0, 32(%rdi)
2178 ; AVX2-NEXT: vmovdqa %ymm3, (%rdi)
2179 ; AVX2-NEXT: vzeroupper
2182 ; XOP-LABEL: splat3_256:
2184 ; XOP-NEXT: vpalignr {{.*#+}} xmm1 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
2185 ; XOP-NEXT: vextractf128 $1, %ymm0, %xmm2
2186 ; XOP-NEXT: vpalignr {{.*#+}} xmm3 = xmm2[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
2187 ; XOP-NEXT: vpalignr {{.*#+}} xmm4 = xmm0[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
2188 ; XOP-NEXT: vpalignr {{.*#+}} xmm5 = xmm2[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
2189 ; XOP-NEXT: vpalignr {{.*#+}} xmm6 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
2190 ; XOP-NEXT: vpalignr {{.*#+}} xmm7 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
2191 ; XOP-NEXT: vpalignr {{.*#+}} xmm3 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
2192 ; XOP-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
2193 ; XOP-NEXT: vpalignr {{.*#+}} xmm5 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4]
2194 ; XOP-NEXT: vpalignr {{.*#+}} xmm4 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
2195 ; XOP-NEXT: vmovdqa {{.*#+}} xmm8 = [5,16,11,6,17,12,7,18,13,8,19,14,9,20,15,10]
2196 ; XOP-NEXT: vpperm %xmm8, %xmm4, %xmm1, %xmm1
2197 ; XOP-NEXT: vpperm %xmm8, %xmm0, %xmm7, %xmm0
2198 ; XOP-NEXT: vpperm %xmm8, %xmm7, %xmm4, %xmm4
2199 ; XOP-NEXT: vpperm %xmm8, %xmm2, %xmm6, %xmm2
2200 ; XOP-NEXT: vpperm %xmm8, %xmm5, %xmm3, %xmm3
2201 ; XOP-NEXT: vpperm %xmm8, %xmm6, %xmm5, %xmm5
2202 ; XOP-NEXT: vmovdqa %xmm5, 80(%rdi)
2203 ; XOP-NEXT: vmovdqa %xmm3, 64(%rdi)
2204 ; XOP-NEXT: vmovdqa %xmm2, 48(%rdi)
2205 ; XOP-NEXT: vmovdqa %xmm4, 32(%rdi)
2206 ; XOP-NEXT: vmovdqa %xmm1, 16(%rdi)
2207 ; XOP-NEXT: vmovdqa %xmm0, (%rdi)
2208 ; XOP-NEXT: vzeroupper
2210 %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
2211 %2 = shufflevector <32 x i8> %a0, <32 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2212 %3 = shufflevector <64 x i8> %1, <64 x i8> %2, <96 x i32> <i32 0, i32 32, i32 64, i32 1, i32 33, i32 65, i32 2, i32 34, i32 66, i32 3, i32 35, i32 67, i32 4, i32 36, i32 68, i32 5, i32 37, i32 69, i32 6, i32 38, i32 70, i32 7, i32 39, i32 71, i32 8, i32 40, i32 72, i32 9, i32 41, i32 73, i32 10, i32 42, i32 74, i32 11, i32 43, i32 75, i32 12, i32 44, i32 76, i32 13, i32 45, i32 77, i32 14, i32 46, i32 78, i32 15, i32 47, i32 79, i32 16, i32 48, i32 80, i32 17, i32 49, i32 81, i32 18, i32 50, i32 82, i32 19, i32 51, i32 83, i32 20, i32 52, i32 84, i32 21, i32 53, i32 85, i32 22, i32 54, i32 86, i32 23, i32 55, i32 87, i32 24, i32 56, i32 88, i32 25, i32 57, i32 89, i32 26, i32 58, i32 90, i32 27, i32 59, i32 91, i32 28, i32 60, i32 92, i32 29, i32 61, i32 93, i32 30, i32 62, i32 94, i32 31, i32 63, i32 95>
2213 store <96 x i8> %3, ptr %a1
; Loads a <3 x i32> from %ptr with byte alignment and places two of its
; elements into an otherwise-zero <16 x i32>: loaded element 1 goes to lane 1
; and loaded element 0 goes to lane 10. The third loaded element is never
; referenced by the shuffles. Despite the name, the mask is a two-lane insert
; into a zero vector rather than a broadcast of one value.
2218 define <16 x i32> @splat_v3i32(ptr %ptr) {
2219 ; SSE2-LABEL: splat_v3i32:
2221 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
2222 ; SSE2-NEXT: xorps %xmm1, %xmm1
2223 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
2224 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,0,1]
2225 ; SSE2-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2226 ; SSE2-NEXT: xorps %xmm1, %xmm1
2227 ; SSE2-NEXT: xorps %xmm3, %xmm3
2230 ; SSE42-LABEL: splat_v3i32:
2232 ; SSE42-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
2233 ; SSE42-NEXT: pxor %xmm1, %xmm1
2234 ; SSE42-NEXT: pxor %xmm2, %xmm2
2235 ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7]
2236 ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
2237 ; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,0,1]
2238 ; SSE42-NEXT: xorps %xmm3, %xmm3
2241 ; AVX1-LABEL: splat_v3i32:
2243 ; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
2244 ; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
2245 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm1[1],ymm2[2,3,4,5,6,7]
2246 ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4,5,6,7]
2249 ; AVX2-SLOW-LABEL: splat_v3i32:
2250 ; AVX2-SLOW: # %bb.0:
2251 ; AVX2-SLOW-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
2252 ; AVX2-SLOW-NEXT: vxorps %xmm2, %xmm2, %xmm2
2253 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm1[1],ymm2[2,3,4,5,6,7]
2254 ; AVX2-SLOW-NEXT: vbroadcastss %xmm1, %xmm1
2255 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4,5,6,7]
2256 ; AVX2-SLOW-NEXT: retq
2258 ; AVX2-FAST-LABEL: splat_v3i32:
2259 ; AVX2-FAST: # %bb.0:
2260 ; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
2261 ; AVX2-FAST-NEXT: vpxor %xmm0, %xmm0, %xmm0
2262 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7]
2263 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,ymm1[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
2264 ; AVX2-FAST-NEXT: retq
2266 ; XOP-LABEL: splat_v3i32:
2268 ; XOP-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
2269 ; XOP-NEXT: vxorps %xmm2, %xmm2, %xmm2
2270 ; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm1[1],ymm2[2,3,4,5,6,7]
2271 ; XOP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4,5,6,7]
2273 %1 = load <3 x i32>, ptr %ptr, align 1
; Widen to 16 lanes keeping only loaded elements 0 and 1; all other lanes are undef.
2274 %2 = shufflevector <3 x i32> %1, <3 x i32> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; Mask indices 16 and up select from %2: lane 1 takes index 17 (%2 lane 1) and
; lane 10 takes index 16 (%2 lane 0). Every other lane takes a zero from the
; constant first operand, whose two undef entries sit in lanes 1 and 10, which
; are never read from it.
2275 %3 = shufflevector <16 x i32> <i32 0, i32 undef, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 undef, i32 0, i32 0, i32 0, i32 0, i32 0>, <16 x i32> %2, <16 x i32> <i32 0, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 11, i32 12, i32 13, i32 14, i32 15>
; Broadcasts element 0 of %A across a <8 x double>, stores it to %P, then runs
; it through a load, a second store and another load of the same address
; before returning lanes <2, 0> of the final load as a <2 x double>. The
; expected assembly for every configuration contains only the broadcast and
; the stores to %P, with no reload from memory.
; Attribute group #0 is not defined within this function.
2279 define <2 x double> @wrongorder(<4 x double> %A, ptr %P) #0 {
2280 ; SSE2-LABEL: wrongorder:
2282 ; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
2283 ; SSE2-NEXT: movaps %xmm0, 48(%rdi)
2284 ; SSE2-NEXT: movaps %xmm0, 32(%rdi)
2285 ; SSE2-NEXT: movaps %xmm0, 16(%rdi)
2286 ; SSE2-NEXT: movaps %xmm0, (%rdi)
2289 ; SSE42-LABEL: wrongorder:
2291 ; SSE42-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
2292 ; SSE42-NEXT: movapd %xmm0, 48(%rdi)
2293 ; SSE42-NEXT: movapd %xmm0, 32(%rdi)
2294 ; SSE42-NEXT: movapd %xmm0, 16(%rdi)
2295 ; SSE42-NEXT: movapd %xmm0, (%rdi)
2298 ; AVX1-LABEL: wrongorder:
2300 ; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
2301 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1
2302 ; AVX1-NEXT: vmovaps %ymm1, 32(%rdi)
2303 ; AVX1-NEXT: vmovaps %ymm1, (%rdi)
2304 ; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2305 ; AVX1-NEXT: vzeroupper
2308 ; AVX2-LABEL: wrongorder:
2310 ; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0
2311 ; AVX2-NEXT: vmovaps %ymm0, 32(%rdi)
2312 ; AVX2-NEXT: vmovaps %ymm0, (%rdi)
2313 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2314 ; AVX2-NEXT: vzeroupper
2317 ; XOP-LABEL: wrongorder:
2319 ; XOP-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
2320 ; XOP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1
2321 ; XOP-NEXT: vmovaps %ymm1, 32(%rdi)
2322 ; XOP-NEXT: vmovaps %ymm1, (%rdi)
2323 ; XOP-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2324 ; XOP-NEXT: vzeroupper
; An all-zero mask selects element 0 of %A for each of the 8 result lanes.
2326 %shuffle = shufflevector <4 x double> %A, <4 x double> %A, <8 x i32> zeroinitializer
2327 store <8 x double> %shuffle, ptr %P, align 64
; Redundant load/store/load round-trip through %P of the value just stored.
2328 %m2 = load <8 x double>, ptr %P, align 64
2329 store <8 x double> %m2, ptr %P, align 64
2330 %m3 = load <8 x double>, ptr %P, align 64
; Lanes 2 and 0 of the reloaded vector both hold the broadcast element.
2331 %m4 = shufflevector <8 x double> %m3, <8 x double> undef, <2 x i32> <i32 2, i32 0>
2332 ret <2 x double> %m4
; Loads a <6 x i8> through an undef pointer, picks bytes 0 and 3 with a
; stride-3 shuffle, sign-extends them to i32, zero-extends those to i64 and
; stores the resulting <2 x i64> through an undef pointer.
; Presumably a regression test for the bug report the function is named
; after (inferred from the name only - TODO confirm).
2335 define void @PR41097() {
2336 ; SSE2-LABEL: PR41097:
2338 ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2339 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2340 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
2341 ; SSE2-NEXT: psrad $24, %xmm0
2342 ; SSE2-NEXT: pxor %xmm1, %xmm1
2343 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2344 ; SSE2-NEXT: movdqu %xmm0, (%rax)
2347 ; SSE42-LABEL: PR41097:
2349 ; SSE42-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2350 ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,3,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
2351 ; SSE42-NEXT: pmovsxbd %xmm0, %xmm0
2352 ; SSE42-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
2353 ; SSE42-NEXT: movdqu %xmm0, (%rax)
2356 ; AVX-LABEL: PR41097:
2358 ; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2359 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,3,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
2360 ; AVX-NEXT: vpmovsxbd %xmm0, %xmm0
2361 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
2362 ; AVX-NEXT: vmovdqu %xmm0, (%rax)
2364 %wide.vec = load <6 x i8>, ptr undef, align 1
; Keep bytes 0 and 3 of the six loaded bytes.
2365 %strided.vec = shufflevector <6 x i8> %wide.vec, <6 x i8> undef, <2 x i32> <i32 0, i32 3>
; Sign-extend i8 -> i32, then zero-extend i32 -> i64 (the upper 32 bits of each lane are zero).
2366 %tmp = sext <2 x i8> %strided.vec to <2 x i32>
2367 %tmp7 = zext <2 x i32> %tmp to <2 x i64>
2368 store <2 x i64> %tmp7, ptr undef, align 8
2372 define void @D107009(ptr %input, ptr %output) {
2373 ; SSE-LABEL: D107009:
2375 ; SSE-NEXT: movdqa 16(%rdi), %xmm0
2376 ; SSE-NEXT: movdqa 80(%rdi), %xmm1
2377 ; SSE-NEXT: movdqa 144(%rdi), %xmm2
2378 ; SSE-NEXT: movdqa 208(%rdi), %xmm3
2379 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
2380 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
2381 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2382 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[1],mem[1]
2383 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
2384 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
2385 ; SSE-NEXT: psrld $16, %xmm2
2386 ; SSE-NEXT: psrld $16, %xmm0
2387 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
2388 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
2389 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
2390 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,1,1]
2391 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[2,3,2,3]
2392 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[3,3,3,3]
2393 ; SSE-NEXT: movdqa %xmm0, 128(%rsi)
2394 ; SSE-NEXT: movdqa %xmm2, 144(%rsi)
2395 ; SSE-NEXT: movdqa %xmm0, 16(%rsi)
2396 ; SSE-NEXT: movdqa %xmm7, 240(%rsi)
2397 ; SSE-NEXT: movdqa %xmm6, 208(%rsi)
2398 ; SSE-NEXT: movdqa %xmm5, 176(%rsi)
2399 ; SSE-NEXT: movdqa %xmm4, 112(%rsi)
2400 ; SSE-NEXT: movdqa %xmm3, 80(%rsi)
2401 ; SSE-NEXT: movdqa %xmm1, 48(%rsi)
2404 ; AVX1-LABEL: D107009:
2406 ; AVX1-NEXT: vmovups 96(%rdi), %ymm0
2407 ; AVX1-NEXT: vmovups (%rdi), %ymm1
2408 ; AVX1-NEXT: vmovups 128(%rdi), %ymm2
2409 ; AVX1-NEXT: vmovups 224(%rdi), %ymm3
2410 ; AVX1-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],mem[0],ymm3[2],mem[2]
2411 ; AVX1-NEXT: vunpcklps {{.*#+}} ymm2 = ymm2[0],mem[0],ymm2[1],mem[1],ymm2[4],mem[4],ymm2[5],mem[5]
2412 ; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,0],ymm2[4,5],ymm3[6,4]
2413 ; AVX1-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5]
2414 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
2415 ; AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
2416 ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4]
2417 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2418 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
2419 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
2420 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1
2421 ; AVX1-NEXT: vpsrld $16, %xmm1, %xmm1
2422 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
2423 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
2424 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,1,3,3]
2425 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
2426 ; AVX1-NEXT: vmovshdup {{.*#+}} ymm3 = ymm1[1,1,3,3,5,5,7,7]
2427 ; AVX1-NEXT: vshufps {{.*#+}} ymm4 = ymm1[3,3,3,3,7,7,7,7]
2428 ; AVX1-NEXT: vshufpd {{.*#+}} ymm5 = ymm1[0,0,3,2]
2429 ; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[1,1,1,1]
2430 ; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[3,3,3,3]
2431 ; AVX1-NEXT: vmovdqa %xmm0, 16(%rsi)
2432 ; AVX1-NEXT: vmovdqa %xmm7, 112(%rsi)
2433 ; AVX1-NEXT: vmovdqa %xmm6, 48(%rsi)
2434 ; AVX1-NEXT: vmovups %ymm1, 128(%rsi)
2435 ; AVX1-NEXT: vmovupd %ymm5, 192(%rsi)
2436 ; AVX1-NEXT: vmovups %ymm4, 224(%rsi)
2437 ; AVX1-NEXT: vmovups %ymm3, 160(%rsi)
2438 ; AVX1-NEXT: vmovups %ymm2, 64(%rsi)
2439 ; AVX1-NEXT: vzeroupper
2442 ; AVX2-LABEL: D107009:
2444 ; AVX2-NEXT: vmovdqu (%rdi), %ymm0
2445 ; AVX2-NEXT: vmovdqu 64(%rdi), %ymm1
2446 ; AVX2-NEXT: vmovdqu 128(%rdi), %ymm2
2447 ; AVX2-NEXT: vmovdqu 192(%rdi), %ymm3
2448 ; AVX2-NEXT: vpunpckldq {{.*#+}} ymm3 = ymm3[0],mem[0],ymm3[1],mem[1],ymm3[4],mem[4],ymm3[5],mem[5]
2449 ; AVX2-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm2[0],mem[0],ymm2[1],mem[1],ymm2[4],mem[4],ymm2[5],mem[5]
2450 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
2451 ; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
2452 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
2453 ; AVX2-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5]
2454 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
2455 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2456 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
2457 ; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
2458 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
2459 ; AVX2-NEXT: vpbroadcastd %xmm0, %ymm2
2460 ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
2461 ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
2462 ; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm0[1,1,1,1,5,5,5,5]
2463 ; AVX2-NEXT: vpshufd {{.*#+}} ymm6 = ymm0[3,3,3,3,7,7,7,7]
2464 ; AVX2-NEXT: vpshufd {{.*#+}} ymm7 = ymm0[2,3,2,3,6,7,6,7]
2465 ; AVX2-NEXT: vmovdqu %ymm0, 128(%rsi)
2466 ; AVX2-NEXT: vmovdqu %ymm7, 192(%rsi)
2467 ; AVX2-NEXT: vmovdqu %ymm6, 224(%rsi)
2468 ; AVX2-NEXT: vmovdqu %ymm5, 160(%rsi)
2469 ; AVX2-NEXT: vmovdqu %ymm4, 64(%rsi)
2470 ; AVX2-NEXT: vmovdqa %xmm3, 112(%rsi)
2471 ; AVX2-NEXT: vmovdqu %ymm2, (%rsi)
2472 ; AVX2-NEXT: vmovdqa %xmm1, 48(%rsi)
2473 ; AVX2-NEXT: vzeroupper
2476 ; XOP-LABEL: D107009:
2478 ; XOP-NEXT: vmovups 96(%rdi), %ymm0
2479 ; XOP-NEXT: vmovups (%rdi), %ymm1
2480 ; XOP-NEXT: vmovups 128(%rdi), %ymm2
2481 ; XOP-NEXT: vmovups 224(%rdi), %ymm3
2482 ; XOP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],mem[0],ymm3[2],mem[2]
2483 ; XOP-NEXT: vunpcklps {{.*#+}} ymm2 = ymm2[0],mem[0],ymm2[1],mem[1],ymm2[4],mem[4],ymm2[5],mem[5]
2484 ; XOP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,0],ymm2[4,5],ymm3[6,4]
2485 ; XOP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5]
2486 ; XOP-NEXT: vextractf128 $1, %ymm1, %xmm1
2487 ; XOP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
2488 ; XOP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4]
2489 ; XOP-NEXT: vextractf128 $1, %ymm0, %xmm0
2490 ; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
2491 ; XOP-NEXT: vpsrld $16, %xmm0, %xmm0
2492 ; XOP-NEXT: vextractf128 $1, %ymm2, %xmm1
2493 ; XOP-NEXT: vpsrld $16, %xmm1, %xmm1
2494 ; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
2495 ; XOP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
2496 ; XOP-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,1,3,3]
2497 ; XOP-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
2498 ; XOP-NEXT: vmovshdup {{.*#+}} ymm3 = ymm1[1,1,3,3,5,5,7,7]
2499 ; XOP-NEXT: vshufps {{.*#+}} ymm4 = ymm1[3,3,3,3,7,7,7,7]
2500 ; XOP-NEXT: vshufpd {{.*#+}} ymm5 = ymm1[0,0,3,2]
2501 ; XOP-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[1,1,1,1]
2502 ; XOP-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[3,3,3,3]
2503 ; XOP-NEXT: vmovdqa %xmm0, 16(%rsi)
2504 ; XOP-NEXT: vmovdqa %xmm7, 112(%rsi)
2505 ; XOP-NEXT: vmovdqa %xmm6, 48(%rsi)
2506 ; XOP-NEXT: vmovups %ymm1, 128(%rsi)
2507 ; XOP-NEXT: vmovupd %ymm5, 192(%rsi)
2508 ; XOP-NEXT: vmovups %ymm4, 224(%rsi)
2509 ; XOP-NEXT: vmovups %ymm3, 160(%rsi)
2510 ; XOP-NEXT: vmovups %ymm2, 64(%rsi)
2511 ; XOP-NEXT: vzeroupper
2513 %i = load <64 x i32>, ptr %input, align 16
2514 %i2 = shufflevector <64 x i32> %i, <64 x i32> poison, <8 x i32> <i32 4, i32 12, i32 20, i32 28, i32 36, i32 44, i32 52, i32 60>
2515 %i3 = lshr <8 x i32> %i2, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
2516 %i4 = add <8 x i32> zeroinitializer, %i3
2517 %i5 = shufflevector <8 x i32> %i4, <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
2518 %i6 = shufflevector <16 x i32> %i5, <16 x i32> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
2519 %i7 = shufflevector <32 x i32> poison, <32 x i32> %i6, <64 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56, i32 1, i32 9, i32 17, i32 25, i32 33, i32 41, i32 49, i32 57, i32 2, i32 10, i32 18, i32 26, i32 34, i32 42, i32 50, i32 58, i32 3, i32 11, i32 19, i32 27, i32 35, i32 43, i32 51, i32 59, i32 4, i32 12, i32 20, i32 28, i32 36, i32 44, i32 52, i32 60, i32 5, i32 13, i32 21, i32 29, i32 37, i32 45, i32 53, i32 61, i32 6, i32 14, i32 22, i32 30, i32 38, i32 46, i32 54, i32 62, i32 7, i32 15, i32 23, i32 31, i32 39, i32 47, i32 55, i32 63>
2520 store <64 x i32> %i7, ptr %output, align 16