1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
3 ; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+sse4.2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE42
4 ; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
5 ; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
6 ; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+xop | FileCheck %s --check-prefix=XOP
8 define void @v3i64(<2 x i64> %a, <2 x i64> %b, <3 x i64>* %p) nounwind {
11 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
12 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
13 ; SSE2-NEXT: movq %xmm2, 16(%rdi)
14 ; SSE2-NEXT: movdqa %xmm0, (%rdi)
19 ; SSE42-NEXT: pextrq $1, %xmm0, 16(%rdi)
20 ; SSE42-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
21 ; SSE42-NEXT: movdqa %xmm0, (%rdi)
26 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm0[0],xmm1[0]
27 ; AVX1-NEXT: vpextrq $1, %xmm0, 16(%rdi)
28 ; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
33 ; AVX2-NEXT: # kill: %xmm0<def> %xmm0<kill> %ymm0<def>
34 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
35 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
36 ; AVX2-NEXT: vpextrq $1, %xmm0, 16(%rdi)
37 ; AVX2-NEXT: vmovdqa %xmm1, (%rdi)
38 ; AVX2-NEXT: vzeroupper
43 ; XOP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm0[0],xmm1[0]
44 ; XOP-NEXT: vpextrq $1, %xmm0, 16(%rdi)
45 ; XOP-NEXT: vmovdqa %xmm1, (%rdi)
47 %r = shufflevector <2 x i64> %a, <2 x i64> %b, <3 x i32> <i32 0, i32 2, i32 1>
48 store <3 x i64> %r, <3 x i64>* %p
51 define void @v3f64(<2 x double> %a, <2 x double> %b, <3 x double>* %p) nounwind {
54 ; SSE-NEXT: movhpd %xmm0, 16(%rdi)
55 ; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
56 ; SSE-NEXT: movapd %xmm0, (%rdi)
61 ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],xmm1[0]
62 ; AVX1-NEXT: vmovhpd %xmm0, 16(%rdi)
63 ; AVX1-NEXT: vmovapd %xmm1, (%rdi)
68 ; AVX2-NEXT: # kill: %xmm0<def> %xmm0<kill> %ymm0<def>
69 ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
70 ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,1,3]
71 ; AVX2-NEXT: vmovhpd %xmm0, 16(%rdi)
72 ; AVX2-NEXT: vmovapd %xmm1, (%rdi)
73 ; AVX2-NEXT: vzeroupper
78 ; XOP-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],xmm1[0]
79 ; XOP-NEXT: vmovhpd %xmm0, 16(%rdi)
80 ; XOP-NEXT: vmovapd %xmm1, (%rdi)
82 %r = shufflevector <2 x double> %a, <2 x double> %b, <3 x i32> <i32 0, i32 2, i32 1>
83 store <3 x double> %r, <3 x double>* %p
87 define void @v3i32(<2 x i32> %a, <2 x i32> %b, <3 x i32>* %p) nounwind {
90 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
91 ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
92 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
93 ; SSE2-NEXT: movd %xmm0, 8(%rdi)
94 ; SSE2-NEXT: movq %xmm2, (%rdi)
99 ; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
100 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
101 ; SSE42-NEXT: pextrd $2, %xmm0, 8(%rdi)
102 ; SSE42-NEXT: movq %xmm1, (%rdi)
107 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
108 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
109 ; AVX1-NEXT: vpextrd $2, %xmm0, 8(%rdi)
110 ; AVX1-NEXT: vmovq %xmm1, (%rdi)
115 ; AVX2-NEXT: vbroadcastss %xmm1, %xmm1
116 ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2,3]
117 ; AVX2-NEXT: vextractps $2, %xmm0, 8(%rdi)
118 ; AVX2-NEXT: vmovlps %xmm1, (%rdi)
123 ; XOP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
124 ; XOP-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
125 ; XOP-NEXT: vpextrd $2, %xmm0, 8(%rdi)
126 ; XOP-NEXT: vmovq %xmm1, (%rdi)
128 %r = shufflevector <2 x i32> %a, <2 x i32> %b, <3 x i32> <i32 0, i32 2, i32 1>
129 store <3 x i32> %r, <3 x i32>* %p
133 define void @v5i16(<4 x i16> %a, <4 x i16> %b, <5 x i16>* %p) nounwind {
136 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
137 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
138 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7]
139 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
140 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
141 ; SSE2-NEXT: pextrw $6, %xmm0, %eax
142 ; SSE2-NEXT: movw %ax, 8(%rdi)
143 ; SSE2-NEXT: movq %xmm2, (%rdi)
146 ; SSE42-LABEL: v5i16:
148 ; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
149 ; SSE42-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
150 ; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7]
151 ; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
152 ; SSE42-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
153 ; SSE42-NEXT: pextrw $6, %xmm0, 8(%rdi)
154 ; SSE42-NEXT: movq %xmm2, (%rdi)
159 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
160 ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
161 ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7]
162 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
163 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
164 ; AVX-NEXT: vpextrw $6, %xmm0, 8(%rdi)
165 ; AVX-NEXT: vmovq %xmm1, (%rdi)
170 ; XOP-NEXT: vpperm {{.*#+}} xmm1 = xmm0[0,1],xmm1[4,5],xmm0[4,5],xmm1[8,9],xmm0[12,13],xmm1[4,5],xmm0[14,15],xmm1[6,7]
171 ; XOP-NEXT: vpextrw $6, %xmm0, 8(%rdi)
172 ; XOP-NEXT: vmovq %xmm1, (%rdi)
174 %r = shufflevector <4 x i16> %a, <4 x i16> %b, <5 x i32> <i32 0, i32 5, i32 1, i32 6, i32 3>
175 store <5 x i16> %r, <5 x i16>* %p
179 define void @v5i32(<4 x i32> %a, <4 x i32> %b, <5 x i32>* %p) nounwind {
182 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,2,2,3]
183 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
184 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
185 ; SSE2-NEXT: movd %xmm2, 16(%rdi)
186 ; SSE2-NEXT: movdqa %xmm0, (%rdi)
189 ; SSE42-LABEL: v5i32:
191 ; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,2]
192 ; SSE42-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
193 ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
194 ; SSE42-NEXT: pextrd $3, %xmm0, 16(%rdi)
195 ; SSE42-NEXT: movdqa %xmm2, (%rdi)
200 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,1],xmm1[1,2]
201 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,2,1,3]
202 ; AVX1-NEXT: vextractps $3, %xmm0, 16(%rdi)
203 ; AVX1-NEXT: vmovaps %xmm1, (%rdi)
208 ; AVX2-NEXT: # kill: %xmm0<def> %xmm0<kill> %ymm0<def>
209 ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
210 ; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <0,5,1,6,3,u,u,u>
211 ; AVX2-NEXT: vpermps %ymm1, %ymm2, %ymm1
212 ; AVX2-NEXT: vextractps $3, %xmm0, 16(%rdi)
213 ; AVX2-NEXT: vmovaps %xmm1, (%rdi)
214 ; AVX2-NEXT: vzeroupper
219 ; XOP-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,1],xmm1[1,2]
220 ; XOP-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,2,1,3]
221 ; XOP-NEXT: vextractps $3, %xmm0, 16(%rdi)
222 ; XOP-NEXT: vmovaps %xmm1, (%rdi)
224 %r = shufflevector <4 x i32> %a, <4 x i32> %b, <5 x i32> <i32 0, i32 5, i32 1, i32 6, i32 3>
225 store <5 x i32> %r, <5 x i32>* %p
229 define void @v5f32(<4 x float> %a, <4 x float> %b, <5 x float>* %p) nounwind {
232 ; SSE2-NEXT: movaps %xmm0, %xmm2
233 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[1,2]
234 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
235 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
236 ; SSE2-NEXT: movss %xmm0, 16(%rdi)
237 ; SSE2-NEXT: movaps %xmm2, (%rdi)
240 ; SSE42-LABEL: v5f32:
242 ; SSE42-NEXT: extractps $3, %xmm0, 16(%rdi)
243 ; SSE42-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[1,2]
244 ; SSE42-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
245 ; SSE42-NEXT: movaps %xmm0, (%rdi)
250 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,1],xmm1[1,2]
251 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,2,1,3]
252 ; AVX1-NEXT: vextractps $3, %xmm0, 16(%rdi)
253 ; AVX1-NEXT: vmovaps %xmm1, (%rdi)
258 ; AVX2-NEXT: # kill: %xmm0<def> %xmm0<kill> %ymm0<def>
259 ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
260 ; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <0,5,1,6,3,u,u,u>
261 ; AVX2-NEXT: vpermps %ymm1, %ymm2, %ymm1
262 ; AVX2-NEXT: vextractps $3, %xmm0, 16(%rdi)
263 ; AVX2-NEXT: vmovaps %xmm1, (%rdi)
264 ; AVX2-NEXT: vzeroupper
269 ; XOP-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,1],xmm1[1,2]
270 ; XOP-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,2,1,3]
271 ; XOP-NEXT: vextractps $3, %xmm0, 16(%rdi)
272 ; XOP-NEXT: vmovaps %xmm1, (%rdi)
274 %r = shufflevector <4 x float> %a, <4 x float> %b, <5 x i32> <i32 0, i32 5, i32 1, i32 6, i32 3>
275 store <5 x float> %r, <5 x float>* %p
279 define void @v7i8(<4 x i8> %a, <4 x i8> %b, <7 x i8>* %p) nounwind {
282 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,1,3]
283 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,0,65535,0,65535,65535,65535]
284 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3]
285 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,0,4,5,6,7]
286 ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7]
287 ; SSE2-NEXT: pand %xmm2, %xmm1
288 ; SSE2-NEXT: pandn %xmm0, %xmm2
289 ; SSE2-NEXT: por %xmm1, %xmm2
290 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255]
291 ; SSE2-NEXT: pand %xmm2, %xmm0
292 ; SSE2-NEXT: packuswb %xmm0, %xmm0
293 ; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp)
294 ; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
295 ; SSE2-NEXT: movb %al, 6(%rdi)
296 ; SSE2-NEXT: movd %xmm0, (%rdi)
297 ; SSE2-NEXT: pextrw $2, %xmm0, %eax
298 ; SSE2-NEXT: movw %ax, 4(%rdi)
303 ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,1,3]
304 ; SSE42-NEXT: pextrb $0, %xmm1, 6(%rdi)
305 ; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,4,5,8,9,0,1,12,13,0,1,14,15]
306 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5,6,7]
307 ; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
308 ; SSE42-NEXT: pextrw $2, %xmm1, 4(%rdi)
309 ; SSE42-NEXT: movd %xmm1, (%rdi)
314 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,1,3]
315 ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[8,9,8,9,4,5,8,9,0,1,12,13,0,1,14,15]
316 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5,6,7]
317 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
318 ; AVX-NEXT: vpextrb $0, %xmm1, 6(%rdi)
319 ; AVX-NEXT: vpextrw $2, %xmm0, 4(%rdi)
320 ; AVX-NEXT: vmovd %xmm0, (%rdi)
325 ; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0],xmm1[8],xmm0[12],xmm1[8],xmm0[4],xmm1[12,0,14,u,u,u,u,u,u,u,u]
326 ; XOP-NEXT: vpextrb $0, %xmm1, 6(%rdi)
327 ; XOP-NEXT: vpextrw $2, %xmm0, 4(%rdi)
328 ; XOP-NEXT: vmovd %xmm0, (%rdi)
330 %r = shufflevector <4 x i8> %a, <4 x i8> %b, <7 x i32> <i32 0, i32 6, i32 3, i32 6, i32 1, i32 7, i32 4>
331 store <7 x i8> %r, <7 x i8>* %p
335 define void @v7i16(<4 x i16> %a, <4 x i16> %b, <7 x i16>* %p) nounwind {
338 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,1,3]
339 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,0,65535,0,65535,65535,65535]
340 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,1,0,3]
341 ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,0,2,0,4,5,6,7]
342 ; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,4,7]
343 ; SSE2-NEXT: pand %xmm2, %xmm3
344 ; SSE2-NEXT: pandn %xmm0, %xmm2
345 ; SSE2-NEXT: por %xmm3, %xmm2
346 ; SSE2-NEXT: movd %xmm1, %eax
347 ; SSE2-NEXT: movw %ax, 12(%rdi)
348 ; SSE2-NEXT: movq %xmm2, (%rdi)
349 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
350 ; SSE2-NEXT: movd %xmm0, 8(%rdi)
353 ; SSE42-LABEL: v7i16:
355 ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,1,3]
356 ; SSE42-NEXT: pextrw $0, %xmm1, 12(%rdi)
357 ; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,4,5,8,9,0,1,12,13,0,1,14,15]
358 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5,6,7]
359 ; SSE42-NEXT: pextrd $2, %xmm1, 8(%rdi)
360 ; SSE42-NEXT: movq %xmm1, (%rdi)
365 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,1,3]
366 ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[8,9,8,9,4,5,8,9,0,1,12,13,0,1,14,15]
367 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5,6,7]
368 ; AVX-NEXT: vpextrw $0, %xmm1, 12(%rdi)
369 ; AVX-NEXT: vpextrd $2, %xmm0, 8(%rdi)
370 ; AVX-NEXT: vmovq %xmm0, (%rdi)
375 ; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0,1],xmm1[8,9],xmm0[12,13],xmm1[8,9],xmm0[4,5],xmm1[12,13,0,1,14,15]
376 ; XOP-NEXT: vpextrw $0, %xmm1, 12(%rdi)
377 ; XOP-NEXT: vpextrd $2, %xmm0, 8(%rdi)
378 ; XOP-NEXT: vmovq %xmm0, (%rdi)
380 %r = shufflevector <4 x i16> %a, <4 x i16> %b, <7 x i32> <i32 0, i32 6, i32 3, i32 6, i32 1, i32 7, i32 4>
381 store <7 x i16> %r, <7 x i16>* %p
386 define void @v7i32(<4 x i32> %a, <4 x i32> %b, <7 x i32>* %p) nounwind {
389 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,2,2]
390 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,1,0,3]
391 ; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
392 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[3,0]
393 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,3]
394 ; SSE2-NEXT: movd %xmm1, 24(%rdi)
395 ; SSE2-NEXT: movlps %xmm0, 16(%rdi)
396 ; SSE2-NEXT: movdqa %xmm3, (%rdi)
399 ; SSE42-LABEL: v7i32:
401 ; SSE42-NEXT: movdqa %xmm1, %xmm2
402 ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3],xmm2[4,5,6,7]
403 ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7]
404 ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,3,2]
405 ; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,0,3]
406 ; SSE42-NEXT: movd %xmm1, 24(%rdi)
407 ; SSE42-NEXT: movq %xmm2, 16(%rdi)
408 ; SSE42-NEXT: movdqa %xmm0, (%rdi)
413 ; AVX1-NEXT: vblendps {{.*#+}} xmm2 = xmm0[0,1],xmm1[2],xmm0[3]
414 ; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,2,3,2]
415 ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
416 ; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,0,3]
417 ; AVX1-NEXT: vmovss %xmm1, 24(%rdi)
418 ; AVX1-NEXT: vmovlps %xmm0, 16(%rdi)
419 ; AVX1-NEXT: vmovaps %xmm2, (%rdi)
424 ; AVX2-NEXT: # kill: %xmm0<def> %xmm0<kill> %ymm0<def>
425 ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
426 ; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <0,6,3,6,1,7,4,u>
427 ; AVX2-NEXT: vpermps %ymm0, %ymm2, %ymm0
428 ; AVX2-NEXT: vmovss %xmm1, 24(%rdi)
429 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
430 ; AVX2-NEXT: vmovlps %xmm1, 16(%rdi)
431 ; AVX2-NEXT: vmovaps %xmm0, (%rdi)
432 ; AVX2-NEXT: vzeroupper
437 ; XOP-NEXT: vblendps {{.*#+}} xmm2 = xmm0[0,1],xmm1[2],xmm0[3]
438 ; XOP-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,2,3,2]
439 ; XOP-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
440 ; XOP-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,0,3]
441 ; XOP-NEXT: vmovss %xmm1, 24(%rdi)
442 ; XOP-NEXT: vmovlps %xmm0, 16(%rdi)
443 ; XOP-NEXT: vmovaps %xmm2, (%rdi)
445 %r = shufflevector <4 x i32> %a, <4 x i32> %b, <7 x i32> <i32 0, i32 6, i32 3, i32 6, i32 1, i32 7, i32 4>
446 store <7 x i32> %r, <7 x i32>* %p
450 define void @v12i8(<8 x i8> %a, <8 x i8> %b, <12 x i8>* %p) nounwind {
453 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
454 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
455 ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
456 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,0,255,255,0,255,255,0,255,255,0,255,255,255,255]
457 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
458 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,1,2,3]
459 ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,1,3,4,5,6,7]
460 ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
461 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
462 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,1,4,5,6,7]
463 ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,4]
464 ; SSE2-NEXT: packuswb %xmm3, %xmm0
465 ; SSE2-NEXT: pand %xmm2, %xmm0
466 ; SSE2-NEXT: pandn %xmm1, %xmm2
467 ; SSE2-NEXT: por %xmm0, %xmm2
468 ; SSE2-NEXT: movq %xmm2, (%rdi)
469 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
470 ; SSE2-NEXT: movd %xmm0, 8(%rdi)
473 ; SSE42-LABEL: v12i8:
475 ; SSE42-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,xmm1[0],zero,zero,xmm1[2],zero,zero,xmm1[4],zero,zero,xmm1[6,u,u,u,u]
476 ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,8],zero,xmm0[2,10],zero,xmm0[4,12],zero,xmm0[6,14],zero,xmm0[u,u,u,u]
477 ; SSE42-NEXT: por %xmm1, %xmm0
478 ; SSE42-NEXT: pextrd $2, %xmm0, 8(%rdi)
479 ; SSE42-NEXT: movq %xmm0, (%rdi)
484 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[0],zero,zero,xmm1[2],zero,zero,xmm1[4],zero,zero,xmm1[6,u,u,u,u]
485 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8],zero,xmm0[2,10],zero,xmm0[4,12],zero,xmm0[6,14],zero,xmm0[u,u,u,u]
486 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
487 ; AVX-NEXT: vpextrd $2, %xmm0, 8(%rdi)
488 ; AVX-NEXT: vmovq %xmm0, (%rdi)
493 ; XOP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[0],zero,zero,xmm1[2],zero,zero,xmm1[4],zero,zero,xmm1[6,u,u,u,u]
494 ; XOP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8],zero,xmm0[2,10],zero,xmm0[4,12],zero,xmm0[6,14],zero,xmm0[u,u,u,u]
495 ; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
496 ; XOP-NEXT: vpextrd $2, %xmm0, 8(%rdi)
497 ; XOP-NEXT: vmovq %xmm0, (%rdi)
499 %r = shufflevector <8 x i8> %a, <8 x i8> %b, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
500 store <12 x i8> %r, <12 x i8>* %p
504 define void @v12i16(<8 x i16> %a, <8 x i16> %b, <12 x i16>* %p) nounwind {
505 ; SSE2-LABEL: v12i16:
507 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,0,3]
508 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,0,65535,65535,0,65535,65535]
509 ; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,6,5,4,7]
510 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,1]
511 ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,1,4,5,6,7]
512 ; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,6,4]
513 ; SSE2-NEXT: pand %xmm3, %xmm4
514 ; SSE2-NEXT: pandn %xmm2, %xmm3
515 ; SSE2-NEXT: por %xmm4, %xmm3
516 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
517 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,0,65535,65535,65535,65535,65535]
518 ; SSE2-NEXT: pand %xmm2, %xmm1
519 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
520 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,1,3,4,5,6,7]
521 ; SSE2-NEXT: pandn %xmm0, %xmm2
522 ; SSE2-NEXT: por %xmm1, %xmm2
523 ; SSE2-NEXT: movq %xmm2, 16(%rdi)
524 ; SSE2-NEXT: movdqa %xmm3, (%rdi)
527 ; SSE42-LABEL: v12i16:
529 ; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
530 ; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,1,2,3]
531 ; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,1,3,4,5,6,7]
532 ; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0],xmm3[1,2],xmm2[3,4,5,6,7]
533 ; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,3]
534 ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,2,3,10,11,10,11,4,5,12,13]
535 ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7]
536 ; SSE42-NEXT: movdqa %xmm0, (%rdi)
537 ; SSE42-NEXT: movq %xmm3, 16(%rdi)
540 ; AVX1-LABEL: v12i16:
542 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
543 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,1,2,3]
544 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,1,3,4,5,6,7]
545 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3],xmm3[4,5,6,7]
546 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,3]
547 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,2,3,10,11,10,11,4,5,12,13]
548 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7]
549 ; AVX1-NEXT: vmovdqa %xmm0, (%rdi)
550 ; AVX1-NEXT: vmovq %xmm2, 16(%rdi)
553 ; AVX2-LABEL: v12i16:
555 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
556 ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,1,2,3]
557 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,1,3,4,5,6,7]
558 ; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3],xmm3[4,5,6,7]
559 ; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1
560 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,2,3,10,11,10,11,4,5,12,13]
561 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7]
562 ; AVX2-NEXT: vmovdqa %xmm0, (%rdi)
563 ; AVX2-NEXT: vmovq %xmm2, 16(%rdi)
568 ; XOP-NEXT: vpperm {{.*#+}} xmm2 = xmm0[0,1,8,9],xmm1[0,1],xmm0[2,3,10,11],xmm1[2,3],xmm0[4,5,12,13]
569 ; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm1[4,5],xmm0[6,7,14,15],xmm1[6,7],xmm0[8,9,10,11,12,13,14,15]
570 ; XOP-NEXT: vmovq %xmm0, 16(%rdi)
571 ; XOP-NEXT: vmovdqa %xmm2, (%rdi)
573 %r = shufflevector <8 x i16> %a, <8 x i16> %b, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
574 store <12 x i16> %r, <12 x i16>* %p
578 define void @v12i32(<8 x i32> %a, <8 x i32> %b, <12 x i32>* %p) nounwind {
579 ; SSE2-LABEL: v12i32:
581 ; SSE2-NEXT: movdqa %xmm0, %xmm3
582 ; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
583 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,2]
584 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,1,0,1]
585 ; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm3[3,0]
586 ; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0,2]
587 ; SSE2-NEXT: movdqa %xmm2, %xmm4
588 ; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm1[1,0]
589 ; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm1[2,2]
590 ; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
591 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm4[3,0]
592 ; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[0,2]
593 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,3,2,2]
594 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
595 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
596 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
597 ; SSE2-NEXT: movaps %xmm0, 32(%rdi)
598 ; SSE2-NEXT: movaps %xmm4, 16(%rdi)
599 ; SSE2-NEXT: movaps %xmm3, (%rdi)
602 ; SSE42-LABEL: v12i32:
604 ; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1]
605 ; SSE42-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,1,0,1]
606 ; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3],xmm4[4,5,6,7]
607 ; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,1,0,1]
608 ; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5],xmm4[6,7]
609 ; SSE42-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,2,2]
610 ; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3],xmm4[4,5,6,7]
611 ; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm0[4,5],xmm4[6,7]
612 ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
613 ; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
614 ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3],xmm2[4,5,6,7]
615 ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,3,3]
616 ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7]
617 ; SSE42-NEXT: movdqa %xmm0, 32(%rdi)
618 ; SSE42-NEXT: movdqa %xmm4, 16(%rdi)
619 ; SSE42-NEXT: movdqa %xmm3, (%rdi)
622 ; AVX1-LABEL: v12i32:
624 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,0,1]
625 ; AVX1-NEXT: vmovsldup {{.*#+}} ymm2 = ymm2[0,0,2,2,4,4,6,6]
626 ; AVX1-NEXT: vpermilps {{.*#+}} ymm3 = ymm0[0,u,u,1,5,u,u,6]
627 ; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4,5],ymm2[6],ymm3[7]
628 ; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = xmm1[0,0]
629 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm3
630 ; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
631 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
632 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3],xmm3[3,3]
633 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,1]
634 ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
635 ; AVX1-NEXT: vmovapd %xmm0, 32(%rdi)
636 ; AVX1-NEXT: vmovaps %ymm2, (%rdi)
637 ; AVX1-NEXT: vzeroupper
640 ; AVX2-LABEL: v12i32:
642 ; AVX2-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[2,3,2,3]
643 ; AVX2-NEXT: vpermilps {{.*#+}} ymm3 = ymm0[3,3,2,3,7,7,6,7]
644 ; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,2,3]
645 ; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3]
646 ; AVX2-NEXT: vmovaps {{.*#+}} ymm3 = <0,4,u,1,5,u,2,6>
647 ; AVX2-NEXT: vpermps %ymm0, %ymm3, %ymm0
648 ; AVX2-NEXT: vbroadcastsd %xmm1, %ymm1
649 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
650 ; AVX2-NEXT: vmovaps %ymm0, (%rdi)
651 ; AVX2-NEXT: vmovaps %xmm2, 32(%rdi)
652 ; AVX2-NEXT: vzeroupper
657 ; XOP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,0,1]
658 ; XOP-NEXT: vpermil2ps {{.*#+}} ymm2 = ymm0[0],ymm2[0],ymm0[u,1,5,u],ymm2[6],ymm0[6]
659 ; XOP-NEXT: vmovddup {{.*#+}} xmm3 = xmm1[0,0]
660 ; XOP-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm3
661 ; XOP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
662 ; XOP-NEXT: vextractf128 $1, %ymm0, %xmm3
663 ; XOP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3],xmm3[3,3]
664 ; XOP-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,1]
665 ; XOP-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
666 ; XOP-NEXT: vmovapd %xmm0, 32(%rdi)
667 ; XOP-NEXT: vmovaps %ymm2, (%rdi)
668 ; XOP-NEXT: vzeroupper
670 %r = shufflevector <8 x i32> %a, <8 x i32> %b, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
671 store <12 x i32> %r, <12 x i32>* %p
675 define void @pr29025(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <12 x i8> *%p) nounwind {
676 ; SSE2-LABEL: pr29025:
678 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255]
679 ; SSE2-NEXT: pand %xmm3, %xmm1
680 ; SSE2-NEXT: pand %xmm3, %xmm0
681 ; SSE2-NEXT: packuswb %xmm1, %xmm0
682 ; SSE2-NEXT: packuswb %xmm0, %xmm0
683 ; SSE2-NEXT: pxor %xmm1, %xmm1
684 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
685 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
686 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,1,3,4,5,6,7]
687 ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
688 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
689 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,1,4,5,6,7]
690 ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,4]
691 ; SSE2-NEXT: packuswb %xmm1, %xmm0
692 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,255,255,0,255,255,0,255,255,0,255,255,255,255]
693 ; SSE2-NEXT: pand %xmm1, %xmm0
694 ; SSE2-NEXT: pand %xmm3, %xmm2
695 ; SSE2-NEXT: packuswb %xmm2, %xmm2
696 ; SSE2-NEXT: packuswb %xmm2, %xmm2
697 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,1,1,4,5,6,7]
698 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
699 ; SSE2-NEXT: pandn %xmm2, %xmm1
700 ; SSE2-NEXT: por %xmm0, %xmm1
701 ; SSE2-NEXT: movq %xmm1, (%rdi)
702 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
703 ; SSE2-NEXT: movd %xmm0, 8(%rdi)
706 ; SSE42-LABEL: pr29025:
708 ; SSE42-NEXT: movdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
709 ; SSE42-NEXT: pshufb %xmm3, %xmm1
710 ; SSE42-NEXT: pshufb %xmm3, %xmm0
711 ; SSE42-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
712 ; SSE42-NEXT: pshufb %xmm3, %xmm2
713 ; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
714 ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,8,1,2,10,3,4,12,5,6,14,7,u,u,u,u]
715 ; SSE42-NEXT: pextrd $2, %xmm0, 8(%rdi)
716 ; SSE42-NEXT: movq %xmm0, (%rdi)
719 ; AVX-LABEL: pr29025:
721 ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
722 ; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1
723 ; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0
724 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
725 ; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm1
726 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
727 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,1,2,10,3,4,12,5,6,14,7,u,u,u,u]
728 ; AVX-NEXT: vpextrd $2, %xmm0, 8(%rdi)
729 ; AVX-NEXT: vmovq %xmm0, (%rdi)
732 ; XOP-LABEL: pr29025:
734 ; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0,4,8,12],xmm1[0,4,8,12],xmm0[u,u,u,u,u,u,u,u]
735 ; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0,4],xmm2[0],xmm0[1,5],xmm2[4],xmm0[2,6],xmm2[8],xmm0[3,7],xmm2[12],xmm0[u,u,u,u]
736 ; XOP-NEXT: vpextrd $2, %xmm0, 8(%rdi)
737 ; XOP-NEXT: vmovq %xmm0, (%rdi)
739 %s1 = shufflevector <4 x i8> %a, <4 x i8> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
740 %s2 = shufflevector <4 x i8> %c, <4 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
741 %r = shufflevector <8 x i8> %s1, <8 x i8> %s2, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
742 store <12 x i8> %r, <12 x i8>* %p, align 1
746 define void @interleave_24i8_out(<24 x i8>* %p, <8 x i8>* %q1, <8 x i8>* %q2, <8 x i8>* %q3) nounwind {
747 ; SSE2-LABEL: interleave_24i8_out:
749 ; SSE2-NEXT: movdqu (%rdi), %xmm0
750 ; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
751 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,0,255,255,0,255,255,255,255,255,255,255,255,255,255]
752 ; SSE2-NEXT: movdqa %xmm0, %xmm2
753 ; SSE2-NEXT: pand %xmm3, %xmm2
754 ; SSE2-NEXT: pandn %xmm1, %xmm3
755 ; SSE2-NEXT: por %xmm2, %xmm3
756 ; SSE2-NEXT: pxor %xmm2, %xmm2
757 ; SSE2-NEXT: movdqa %xmm3, %xmm4
758 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15]
759 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,0,65535,65535,0]
760 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
761 ; SSE2-NEXT: pand %xmm5, %xmm3
762 ; SSE2-NEXT: pandn %xmm4, %xmm5
763 ; SSE2-NEXT: por %xmm3, %xmm5
764 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,2,1,3]
765 ; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5]
766 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,1]
767 ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,1,4,5,6,7]
768 ; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,4,7]
769 ; SSE2-NEXT: packuswb %xmm0, %xmm3
770 ; SSE2-NEXT: movq %xmm3, (%rsi)
771 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,255,255,0,255,255,0,255,255,255,255,255,255,255,255,255]
772 ; SSE2-NEXT: movdqa %xmm0, %xmm4
773 ; SSE2-NEXT: pand %xmm3, %xmm4
774 ; SSE2-NEXT: pandn %xmm1, %xmm3
775 ; SSE2-NEXT: por %xmm4, %xmm3
776 ; SSE2-NEXT: movdqa %xmm3, %xmm4
777 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15]
778 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,0,65535,65535,0,65535,65535]
779 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
780 ; SSE2-NEXT: pand %xmm5, %xmm3
781 ; SSE2-NEXT: pandn %xmm4, %xmm5
782 ; SSE2-NEXT: por %xmm3, %xmm5
783 ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[2,1,0,3,4,5,6,7]
784 ; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,4,7]
785 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,1]
786 ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,3,0,4,5,6,7]
787 ; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,7,4]
788 ; SSE2-NEXT: packuswb %xmm0, %xmm3
789 ; SSE2-NEXT: movq %xmm3, (%rdx)
790 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,0,255,255,0,255,255,0,255,255,255,255,255,255,255,255]
791 ; SSE2-NEXT: pand %xmm3, %xmm0
792 ; SSE2-NEXT: pandn %xmm1, %xmm3
793 ; SSE2-NEXT: por %xmm0, %xmm3
794 ; SSE2-NEXT: movdqa %xmm3, %xmm0
795 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
796 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,0,65535,65535,0,65535]
797 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
798 ; SSE2-NEXT: pand %xmm1, %xmm3
799 ; SSE2-NEXT: pandn %xmm0, %xmm1
800 ; SSE2-NEXT: por %xmm3, %xmm1
801 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,0]
802 ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5]
803 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
804 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7]
805 ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
806 ; SSE2-NEXT: packuswb %xmm0, %xmm0
807 ; SSE2-NEXT: movq %xmm0, (%rcx)
810 ; SSE42-LABEL: interleave_24i8_out:
812 ; SSE42-NEXT: movdqu (%rdi), %xmm0
813 ; SSE42-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
814 ; SSE42-NEXT: movdqa %xmm1, %xmm2
815 ; SSE42-NEXT: pshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm2[2,5,u,u,u,u,u,u,u,u]
816 ; SSE42-NEXT: movdqa %xmm0, %xmm3
817 ; SSE42-NEXT: pshufb {{.*#+}} xmm3 = xmm3[0,3,6,9,12,15],zero,zero,xmm3[u,u,u,u,u,u,u,u]
818 ; SSE42-NEXT: por %xmm2, %xmm3
819 ; SSE42-NEXT: movq %xmm3, (%rsi)
820 ; SSE42-NEXT: movdqa %xmm1, %xmm2
821 ; SSE42-NEXT: pshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm2[0,3,6,u,u,u,u,u,u,u,u]
822 ; SSE42-NEXT: movdqa %xmm0, %xmm3
823 ; SSE42-NEXT: pshufb {{.*#+}} xmm3 = xmm3[1,4,7,10,13],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u]
824 ; SSE42-NEXT: por %xmm2, %xmm3
825 ; SSE42-NEXT: movq %xmm3, (%rdx)
826 ; SSE42-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u]
827 ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
828 ; SSE42-NEXT: por %xmm1, %xmm0
829 ; SSE42-NEXT: movq %xmm0, (%rcx)
832 ; AVX-LABEL: interleave_24i8_out:
834 ; AVX-NEXT: vmovdqu (%rdi), %xmm0
835 ; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
836 ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u]
837 ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u]
838 ; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2
839 ; AVX-NEXT: vmovq %xmm2, (%rsi)
840 ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u]
841 ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
842 ; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2
843 ; AVX-NEXT: vmovq %xmm2, (%rdx)
844 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u]
845 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
846 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
847 ; AVX-NEXT: vmovq %xmm0, (%rcx)
850 ; XOP-LABEL: interleave_24i8_out:
852 ; XOP-NEXT: vmovdqu (%rdi), %xmm0
853 ; XOP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
854 ; XOP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u]
855 ; XOP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u]
856 ; XOP-NEXT: vpor %xmm2, %xmm3, %xmm2
857 ; XOP-NEXT: vmovq %xmm2, (%rsi)
858 ; XOP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u]
859 ; XOP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
860 ; XOP-NEXT: vpor %xmm2, %xmm3, %xmm2
861 ; XOP-NEXT: vmovq %xmm2, (%rdx)
862 ; XOP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u]
863 ; XOP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
864 ; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
865 ; XOP-NEXT: vmovq %xmm0, (%rcx)
867 %wide.vec = load <24 x i8>, <24 x i8>* %p, align 4
868 %s1 = shufflevector <24 x i8> %wide.vec, <24 x i8> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
869 %s2 = shufflevector <24 x i8> %wide.vec, <24 x i8> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
870 %s3 = shufflevector <24 x i8> %wide.vec, <24 x i8> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
871 store <8 x i8> %s1, <8 x i8>* %q1, align 4
872 store <8 x i8> %s2, <8 x i8>* %q2, align 4
873 store <8 x i8> %s3, <8 x i8>* %q3, align 4
877 define void @interleave_24i8_in(<24 x i8>* %p, <8 x i8>* %q1, <8 x i8>* %q2, <8 x i8>* %q3) nounwind {
878 ; SSE2-LABEL: interleave_24i8_in:
880 ; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
881 ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
882 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
883 ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
884 ; SSE2-NEXT: pxor %xmm2, %xmm2
885 ; SSE2-NEXT: movdqa %xmm1, %xmm3
886 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
887 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,2,2]
888 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,0,65535,65535,0,65535,65535]
889 ; SSE2-NEXT: pand %xmm5, %xmm4
890 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
891 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[0,1,3,3,4,5,6,7]
892 ; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,6,7]
893 ; SSE2-NEXT: pandn %xmm2, %xmm5
894 ; SSE2-NEXT: por %xmm4, %xmm5
895 ; SSE2-NEXT: movdqa %xmm3, %xmm2
896 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
897 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
898 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,2,2,4,5,6,7]
899 ; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,4,5]
900 ; SSE2-NEXT: packuswb %xmm5, %xmm2
901 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255]
902 ; SSE2-NEXT: pand %xmm4, %xmm2
903 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,1,0,1]
904 ; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,0,0,3,4,5,6,7]
905 ; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,6,6]
906 ; SSE2-NEXT: pandn %xmm5, %xmm4
907 ; SSE2-NEXT: por %xmm2, %xmm4
908 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
909 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
910 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,1,0,4,5,6,7]
911 ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7]
912 ; SSE2-NEXT: packuswb %xmm0, %xmm1
913 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,255,0,255,255,0,255,255,255,255,255,255,255,255]
914 ; SSE2-NEXT: pand %xmm2, %xmm1
915 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,3,3,4,5,6,7]
916 ; SSE2-NEXT: pandn %xmm0, %xmm2
917 ; SSE2-NEXT: por %xmm1, %xmm2
918 ; SSE2-NEXT: movq %xmm2, 16(%rdi)
919 ; SSE2-NEXT: movdqu %xmm4, (%rdi)
922 ; SSE42-LABEL: interleave_24i8_in:
924 ; SSE42-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
925 ; SSE42-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
926 ; SSE42-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
927 ; SSE42-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
928 ; SSE42-NEXT: movdqa %xmm0, %xmm2
929 ; SSE42-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,8],zero,xmm2[1,9],zero,xmm2[2,10],zero,xmm2[3,11],zero,xmm2[4,12],zero,xmm2[5]
930 ; SSE42-NEXT: movdqa %xmm1, %xmm3
931 ; SSE42-NEXT: pshufb {{.*#+}} xmm3 = zero,zero,xmm3[0],zero,zero,xmm3[1],zero,zero,xmm3[2],zero,zero,xmm3[3],zero,zero,xmm3[4],zero
932 ; SSE42-NEXT: por %xmm2, %xmm3
933 ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[13],zero,xmm0[6,14],zero,xmm0[7,15],zero,xmm0[u,u,u,u,u,u,u,u]
934 ; SSE42-NEXT: pshufb {{.*#+}} xmm1 = zero,xmm1[5],zero,zero,xmm1[6],zero,zero,xmm1[7,u,u,u,u,u,u,u,u]
935 ; SSE42-NEXT: por %xmm0, %xmm1
936 ; SSE42-NEXT: movq %xmm1, 16(%rdi)
937 ; SSE42-NEXT: movdqu %xmm3, (%rdi)
940 ; AVX-LABEL: interleave_24i8_in:
942 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
943 ; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
944 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
945 ; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
946 ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,8],zero,xmm0[1,9],zero,xmm0[2,10],zero,xmm0[3,11],zero,xmm0[4,12],zero,xmm0[5]
947 ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm1[0],zero,zero,xmm1[1],zero,zero,xmm1[2],zero,zero,xmm1[3],zero,zero,xmm1[4],zero
948 ; AVX-NEXT: vpor %xmm3, %xmm2, %xmm2
949 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[13],zero,xmm0[6,14],zero,xmm0[7,15],zero,xmm0[u,u,u,u,u,u,u,u]
950 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[5],zero,zero,xmm1[6],zero,zero,xmm1[7,u,u,u,u,u,u,u,u]
951 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
952 ; AVX-NEXT: vmovq %xmm0, 16(%rdi)
953 ; AVX-NEXT: vmovdqu %xmm2, (%rdi)
956 ; XOP-LABEL: interleave_24i8_in:
958 ; XOP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
959 ; XOP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
960 ; XOP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
961 ; XOP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
962 ; XOP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,8],zero,xmm0[1,9],zero,xmm0[2,10],zero,xmm0[3,11],zero,xmm0[4,12],zero,xmm0[5]
963 ; XOP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm1[0],zero,zero,xmm1[1],zero,zero,xmm1[2],zero,zero,xmm1[3],zero,zero,xmm1[4],zero
964 ; XOP-NEXT: vpor %xmm3, %xmm2, %xmm2
965 ; XOP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[13],zero,xmm0[6,14],zero,xmm0[7,15],zero,xmm0[u,u,u,u,u,u,u,u]
966 ; XOP-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[5],zero,zero,xmm1[6],zero,zero,xmm1[7,u,u,u,u,u,u,u,u]
967 ; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
968 ; XOP-NEXT: vmovq %xmm0, 16(%rdi)
969 ; XOP-NEXT: vmovdqu %xmm2, (%rdi)
971 %s1 = load <8 x i8>, <8 x i8>* %q1, align 4
972 %s2 = load <8 x i8>, <8 x i8>* %q2, align 4
973 %s3 = load <8 x i8>, <8 x i8>* %q3, align 4
974 %t1 = shufflevector <8 x i8> %s1, <8 x i8> %s2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
975 %t2 = shufflevector <8 x i8> %s3, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
976 %interleaved = shufflevector <16 x i8> %t1, <16 x i8> %t2, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
977 store <24 x i8> %interleaved, <24 x i8>* %p, align 4
982 define void @interleave_24i16_out(<24 x i16>* %p, <8 x i16>* %q1, <8 x i16>* %q2, <8 x i16>* %q3) nounwind {
983 ; SSE2-LABEL: interleave_24i16_out:
985 ; SSE2-NEXT: movdqu (%rdi), %xmm3
986 ; SSE2-NEXT: movdqu 16(%rdi), %xmm2
987 ; SSE2-NEXT: movdqu 32(%rdi), %xmm8
988 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,0,65535,65535,0]
989 ; SSE2-NEXT: movdqa %xmm3, %xmm4
990 ; SSE2-NEXT: pand %xmm1, %xmm4
991 ; SSE2-NEXT: pandn %xmm2, %xmm1
992 ; SSE2-NEXT: por %xmm4, %xmm1
993 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
994 ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
995 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
996 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7]
997 ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7]
998 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,1,2,1]
999 ; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5]
1000 ; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,0],xmm1[2,0]
1001 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,0]
1002 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,0,65535,65535,0,65535,65535]
1003 ; SSE2-NEXT: movdqa %xmm4, %xmm5
1004 ; SSE2-NEXT: pandn %xmm2, %xmm5
1005 ; SSE2-NEXT: movdqa %xmm3, %xmm6
1006 ; SSE2-NEXT: pand %xmm4, %xmm6
1007 ; SSE2-NEXT: por %xmm5, %xmm6
1008 ; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm6[2,1,2,3,4,5,6,7]
1009 ; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7]
1010 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,3]
1011 ; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,2,3,0,4,5,6,7]
1012 ; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,6,7]
1013 ; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535,65535,0,0,0]
1014 ; SSE2-NEXT: pand %xmm6, %xmm5
1015 ; SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm8[0,3,2,3,4,5,6,7]
1016 ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,0,3]
1017 ; SSE2-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,5,6]
1018 ; SSE2-NEXT: movdqa %xmm6, %xmm0
1019 ; SSE2-NEXT: pandn %xmm7, %xmm0
1020 ; SSE2-NEXT: por %xmm5, %xmm0
1021 ; SSE2-NEXT: pand %xmm4, %xmm2
1022 ; SSE2-NEXT: pandn %xmm3, %xmm4
1023 ; SSE2-NEXT: por %xmm2, %xmm4
1024 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[3,1,2,0]
1025 ; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7]
1026 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,0,3]
1027 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,0,3,4,5,6,7]
1028 ; SSE2-NEXT: pand %xmm6, %xmm2
1029 ; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,4,7,6,7]
1030 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0]
1031 ; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,4,5]
1032 ; SSE2-NEXT: pandn %xmm3, %xmm6
1033 ; SSE2-NEXT: por %xmm2, %xmm6
1034 ; SSE2-NEXT: movups %xmm1, (%rsi)
1035 ; SSE2-NEXT: movdqu %xmm0, (%rdx)
1036 ; SSE2-NEXT: movdqu %xmm6, (%rcx)
1039 ; SSE42-LABEL: interleave_24i16_out:
1041 ; SSE42-NEXT: movdqu (%rdi), %xmm0
1042 ; SSE42-NEXT: movdqu 16(%rdi), %xmm1
1043 ; SSE42-NEXT: movdqu 32(%rdi), %xmm2
1044 ; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,1,2,1]
1045 ; SSE42-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,5]
1046 ; SSE42-NEXT: movdqa %xmm0, %xmm4
1047 ; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3],xmm1[4],xmm4[5,6],xmm1[7]
1048 ; SSE42-NEXT: pshufb {{.*#+}} xmm4 = xmm4[0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15]
1049 ; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5],xmm3[6,7]
1050 ; SSE42-NEXT: movdqa %xmm2, %xmm3
1051 ; SSE42-NEXT: pshufb {{.*#+}} xmm3 = xmm3[0,1,6,7,4,5,6,7,0,1,0,1,6,7,12,13]
1052 ; SSE42-NEXT: movdqa %xmm0, %xmm5
1053 ; SSE42-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm1[2],xmm5[3,4],xmm1[5],xmm5[6,7]
1054 ; SSE42-NEXT: pshufb {{.*#+}} xmm5 = xmm5[2,3,8,9,14,15,4,5,10,11,10,11,8,9,14,15]
1055 ; SSE42-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm3[5,6,7]
1056 ; SSE42-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,7,8,9,2,3,8,9,14,15]
1057 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7]
1058 ; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3]
1059 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7]
1060 ; SSE42-NEXT: movdqu %xmm4, (%rsi)
1061 ; SSE42-NEXT: movdqu %xmm5, (%rdx)
1062 ; SSE42-NEXT: movdqu %xmm1, (%rcx)
1065 ; AVX1-LABEL: interleave_24i16_out:
1067 ; AVX1-NEXT: vmovdqu 32(%rdi), %xmm0
1068 ; AVX1-NEXT: vmovdqu (%rdi), %ymm1
1069 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
1070 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6],xmm2[7]
1071 ; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15]
1072 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,1,2,1]
1073 ; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5]
1074 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm4[6,7]
1075 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7]
1076 ; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,3,8,9,14,15,4,5,10,11,10,11,8,9,14,15]
1077 ; AVX1-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[0,1,6,7,4,5,6,7,0,1,0,1,6,7,12,13]
1078 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5,6,7]
1079 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7]
1080 ; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3]
1081 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,7,8,9,2,3,8,9,14,15]
1082 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7]
1083 ; AVX1-NEXT: vmovdqu %xmm3, (%rsi)
1084 ; AVX1-NEXT: vmovdqu %xmm4, (%rdx)
1085 ; AVX1-NEXT: vmovdqu %xmm0, (%rcx)
1086 ; AVX1-NEXT: vzeroupper
1089 ; AVX2-LABEL: interleave_24i16_out:
1091 ; AVX2-NEXT: vmovdqu (%rdi), %ymm0
1092 ; AVX2-NEXT: vmovdqu 32(%rdi), %xmm1
1093 ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
1094 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
1095 ; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6],xmm3[7]
1096 ; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11]
1097 ; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
1098 ; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
1099 ; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7]
1100 ; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13]
1101 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15]
1102 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1103 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7]
1104 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15]
1105 ; AVX2-NEXT: vmovdqu %xmm2, (%rsi)
1106 ; AVX2-NEXT: vmovdqu %xmm3, (%rdx)
1107 ; AVX2-NEXT: vmovdqu %xmm0, (%rcx)
1108 ; AVX2-NEXT: vzeroupper
1111 ; XOP-LABEL: interleave_24i16_out:
1113 ; XOP-NEXT: vmovdqu 32(%rdi), %xmm0
1114 ; XOP-NEXT: vmovdqu (%rdi), %ymm1
1115 ; XOP-NEXT: vextractf128 $1, %ymm1, %xmm2
1116 ; XOP-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6],xmm2[7]
1117 ; XOP-NEXT: vpperm {{.*#+}} xmm3 = xmm3[0,1,6,7,12,13,2,3,8,9,14,15],xmm0[4,5,10,11]
1118 ; XOP-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7]
1119 ; XOP-NEXT: vpperm {{.*#+}} xmm4 = xmm4[2,3,8,9,14,15,4,5,10,11],xmm0[0,1,6,7,12,13]
1120 ; XOP-NEXT: vpperm {{.*#+}} xmm1 = xmm1[4,5,10,11],xmm2[0,1,6,7,12,13,14,15,0,1,2,3]
1121 ; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6,7,8,9],xmm0[2,3,8,9,14,15]
1122 ; XOP-NEXT: vmovdqu %xmm3, (%rsi)
1123 ; XOP-NEXT: vmovdqu %xmm4, (%rdx)
1124 ; XOP-NEXT: vmovdqu %xmm0, (%rcx)
1125 ; XOP-NEXT: vzeroupper
1127 %wide.vec = load <24 x i16>, <24 x i16>* %p, align 4
1128 %s1 = shufflevector <24 x i16> %wide.vec, <24 x i16> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
1129 %s2 = shufflevector <24 x i16> %wide.vec, <24 x i16> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
1130 %s3 = shufflevector <24 x i16> %wide.vec, <24 x i16> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
1131 store <8 x i16> %s1, <8 x i16>* %q1, align 4
1132 store <8 x i16> %s2, <8 x i16>* %q2, align 4
1133 store <8 x i16> %s3, <8 x i16>* %q3, align 4
1137 define void @interleave_24i16_in(<24 x i16>* %p, <8 x i16>* %q1, <8 x i16>* %q2, <8 x i16>* %q3) nounwind {
1138 ; SSE2-LABEL: interleave_24i16_in:
1140 ; SSE2-NEXT: movdqu (%rsi), %xmm3
1141 ; SSE2-NEXT: movdqu (%rdx), %xmm2
1142 ; SSE2-NEXT: movdqu (%rcx), %xmm1
1143 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,0,0,3]
1144 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,0,65535,65535,0,65535,65535]
1145 ; SSE2-NEXT: movdqa %xmm0, %xmm5
1146 ; SSE2-NEXT: pandn %xmm4, %xmm5
1147 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,3,3,3]
1148 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,1,2,2]
1149 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
1150 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1]
1151 ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,2,2,4,5,6,7]
1152 ; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,4,5]
1153 ; SSE2-NEXT: pand %xmm0, %xmm3
1154 ; SSE2-NEXT: por %xmm5, %xmm3
1155 ; SSE2-NEXT: movdqa %xmm0, %xmm5
1156 ; SSE2-NEXT: pandn %xmm4, %xmm5
1157 ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,1,3,3,4,5,6,7]
1158 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
1159 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
1160 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,3,2,0,4,5,6,7]
1161 ; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,6,7]
1162 ; SSE2-NEXT: pand %xmm0, %xmm2
1163 ; SSE2-NEXT: por %xmm5, %xmm2
1164 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,0,65535,65535,0]
1165 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,2]
1166 ; SSE2-NEXT: pand %xmm5, %xmm1
1167 ; SSE2-NEXT: pandn %xmm6, %xmm5
1168 ; SSE2-NEXT: por %xmm1, %xmm5
1169 ; SSE2-NEXT: pand %xmm0, %xmm5
1170 ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,4,4,6,7]
1171 ; SSE2-NEXT: pandn %xmm1, %xmm0
1172 ; SSE2-NEXT: por %xmm5, %xmm0
1173 ; SSE2-NEXT: movdqu %xmm0, 16(%rdi)
1174 ; SSE2-NEXT: movdqu %xmm2, 32(%rdi)
1175 ; SSE2-NEXT: movdqu %xmm3, (%rdi)
1178 ; SSE42-LABEL: interleave_24i16_in:
1180 ; SSE42-NEXT: movdqu (%rsi), %xmm0
1181 ; SSE42-NEXT: movdqu (%rdx), %xmm1
1182 ; SSE42-NEXT: movdqu (%rcx), %xmm2
1183 ; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,2,2]
1184 ; SSE42-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,3,3,3]
1185 ; SSE42-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1186 ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
1187 ; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,0,3]
1188 ; SSE42-NEXT: pblendw {{.*#+}} xmm5 = xmm0[0,1],xmm5[2],xmm0[3,4],xmm5[5],xmm0[6,7]
1189 ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,2]
1190 ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6],xmm3[7]
1191 ; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[0,1,3,3,4,5,6,7]
1192 ; SSE42-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,6,7]
1193 ; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm0[0,1],xmm3[2],xmm0[3,4],xmm3[5],xmm0[6,7]
1194 ; SSE42-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
1195 ; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[4,5,6,7,4,5,8,9,10,11,10,11,12,13,14,15]
1196 ; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm1[0,1],xmm4[2],xmm1[3,4],xmm4[5],xmm1[6,7]
1197 ; SSE42-NEXT: movdqu %xmm4, 32(%rdi)
1198 ; SSE42-NEXT: movdqu %xmm3, 16(%rdi)
1199 ; SSE42-NEXT: movdqu %xmm5, (%rdi)
1202 ; AVX1-LABEL: interleave_24i16_in:
1204 ; AVX1-NEXT: vmovdqu (%rsi), %xmm0
1205 ; AVX1-NEXT: vmovdqu (%rdx), %xmm1
1206 ; AVX1-NEXT: vmovdqu (%rcx), %xmm2
1207 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,2,2]
1208 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[0,1,3,3,4,5,6,7]
1209 ; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,6,7]
1210 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7]
1211 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,2,2]
1212 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1,2],xmm4[3],xmm3[4,5],xmm4[6],xmm3[7]
1213 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1214 ; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
1215 ; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,0,0,3]
1216 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7]
1217 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
1218 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1219 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15]
1220 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,2,3,3]
1221 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7]
1222 ; AVX1-NEXT: vmovdqu %xmm0, 32(%rdi)
1223 ; AVX1-NEXT: vmovups %ymm3, (%rdi)
1224 ; AVX1-NEXT: vzeroupper
1227 ; AVX2-LABEL: interleave_24i16_in:
1229 ; AVX2-NEXT: vmovdqu (%rsi), %xmm0
1230 ; AVX2-NEXT: vmovdqu (%rdx), %xmm1
1231 ; AVX2-NEXT: vmovdqu (%rcx), %xmm2
1232 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3
1233 ; AVX2-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[0,1,2,3,6,7,2,3,8,9,8,9,4,5,6,7,16,17,18,19,22,23,18,19,24,25,24,25,20,21,22,23]
1234 ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1]
1235 ; AVX2-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,0,1,u,u,u,u,2,3,u,u,u,u,4,5,u,u,22,23,u,u,u,u,24,25,u,u,u,u,26,27]
1236 ; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7],ymm4[8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14],ymm3[15]
1237 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = <u,0,0,u,1,1,u,2>
1238 ; AVX2-NEXT: vpermd %ymm2, %ymm4, %ymm4
1239 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255]
1240 ; AVX2-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
1241 ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1242 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15]
1243 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,2,3,3]
1244 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7]
1245 ; AVX2-NEXT: vmovdqu %xmm0, 32(%rdi)
1246 ; AVX2-NEXT: vmovdqu %ymm3, (%rdi)
1247 ; AVX2-NEXT: vzeroupper
1250 ; XOP-LABEL: interleave_24i16_in:
; XOP-NEXT: vmovdqu (%rsi), %xmm0
; XOP-NEXT: vmovdqu (%rdx), %xmm1
; XOP-NEXT: vmovdqu (%rcx), %xmm2
; XOP-NEXT: vpperm {{.*#+}} xmm3 = xmm0[4,5,6,7],xmm1[6,7],xmm0[6,7,8,9],xmm1[8,9],xmm0[8,9,10,11]
; XOP-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,2,2]
; XOP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1,2],xmm4[3],xmm3[4,5],xmm4[6],xmm3[7]
; XOP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; XOP-NEXT: vpperm {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm2[0,1],xmm4[4,5,6,7],xmm2[2,3],xmm4[8,9,10,11]
; XOP-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm1[10,11],xmm0[12,13,12,13],xmm1[12,13,12,13],xmm0[14,15],xmm1[14,15],xmm0[14,15]
; XOP-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,2,3,3]
; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7]
; XOP-NEXT: vmovdqu %xmm0, 32(%rdi)
; XOP-NEXT: vmovups %ymm3, (%rdi)
; XOP-NEXT: vzeroupper
; XOP-NEXT: retq
  %s1 = load <8 x i16>, <8 x i16>* %q1, align 4
  %s2 = load <8 x i16>, <8 x i16>* %q2, align 4
  %s3 = load <8 x i16>, <8 x i16>* %q3, align 4
  %t1 = shufflevector <8 x i16> %s1, <8 x i16> %s2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %t2 = shufflevector <8 x i16> %s3, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %interleaved = shufflevector <16 x i16> %t1, <16 x i16> %t2, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
  store <24 x i16> %interleaved, <24 x i16>* %p, align 4
  ret void
}

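; interleave_24i32_out de-interleaves a <24 x i32> buffer loaded from %p into three
; <8 x i32> vectors (elements 0,3,6,... / 1,4,7,... / 2,5,8,...) stored to %q1, %q2 and %q3.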
define void @interleave_24i32_out(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2, <8 x i32>* %q3) nounwind {
; SSE2-LABEL: interleave_24i32_out:
; SSE2: # BB#0:
; SSE2-NEXT: movups 80(%rdi), %xmm5
; SSE2-NEXT: movups 64(%rdi), %xmm8
; SSE2-NEXT: movups (%rdi), %xmm0
; SSE2-NEXT: movups 16(%rdi), %xmm6
; SSE2-NEXT: movups 32(%rdi), %xmm2
; SSE2-NEXT: movups 48(%rdi), %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm3
; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm8[2,3]
; SSE2-NEXT: movaps %xmm5, %xmm4
; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm3[2,0]
; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,0]
; SSE2-NEXT: movaps %xmm0, %xmm4
; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm6[2,3]
; SSE2-NEXT: movaps %xmm2, %xmm7
; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm4[2,0]
; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm7[2,0]
; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm0[2,3,0,1]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm6[0,0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm6[3,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm2[0,1,0,3]
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[2,0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm8[0,0]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm8[3,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,1,0,3]
; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm1[2,0]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,0]
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm5[0],xmm10[1],xmm5[1]
; SSE2-NEXT: movsd {{.*#+}} xmm9 = xmm10[0],xmm9[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
; SSE2-NEXT: movsd {{.*#+}} xmm7 = xmm2[0],xmm7[1]
; SSE2-NEXT: movups %xmm3, 16(%rsi)
; SSE2-NEXT: movups %xmm4, (%rsi)
; SSE2-NEXT: movups %xmm1, 16(%rdx)
; SSE2-NEXT: movups %xmm0, (%rdx)
; SSE2-NEXT: movupd %xmm7, 16(%rcx)
; SSE2-NEXT: movupd %xmm9, (%rcx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: interleave_24i32_out:
; SSE42: # BB#0:
; SSE42-NEXT: movdqu 80(%rdi), %xmm9
; SSE42-NEXT: movdqu 64(%rdi), %xmm10
; SSE42-NEXT: movdqu (%rdi), %xmm4
; SSE42-NEXT: movdqu 16(%rdi), %xmm2
; SSE42-NEXT: movdqu 32(%rdi), %xmm11
; SSE42-NEXT: movdqu 48(%rdi), %xmm5
; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm11[0,1,0,1]
; SSE42-NEXT: movdqa %xmm2, %xmm7
; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1],xmm4[2,3],xmm7[4,5,6,7]
; SSE42-NEXT: pshufd {{.*#+}} xmm6 = xmm4[2,3,0,1]
; SSE42-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm2[2,3]
; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5],xmm8[6,7]
; SSE42-NEXT: movdqa %xmm10, %xmm1
; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3],xmm1[4,5,6,7]
; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,0,1]
; SSE42-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm10[2,3]
; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm9[0,1,0,1]
; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm5[0,1,2,3,4,5],xmm3[6,7]
; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm11[0,1,2,2]
; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,0,3,3]
; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5],xmm5[6,7]
; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,0,3,3]
; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm9[0,1,2,2]
; SSE42-NEXT: pblendw {{.*#+}} xmm5 = xmm1[0,1,2,3,4,5],xmm5[6,7]
; SSE42-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1],xmm2[2,3],xmm6[4,5,6,7]
; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,1,0,3]
; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm6[0,1,2,3],xmm1[4,5,6,7]
; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm10[2,3],xmm0[4,5,6,7]
; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm9[0,1,0,3]
; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; SSE42-NEXT: movdqu %xmm3, 16(%rsi)
; SSE42-NEXT: movdqu %xmm4, (%rsi)
; SSE42-NEXT: movdqu %xmm5, 16(%rdx)
; SSE42-NEXT: movdqu %xmm7, (%rdx)
; SSE42-NEXT: movdqu %xmm2, 16(%rcx)
; SSE42-NEXT: movdqu %xmm1, (%rcx)
; SSE42-NEXT: retq
;
; AVX1-LABEL: interleave_24i32_out:
; AVX1: # BB#0:
; AVX1-NEXT: vmovups (%rdi), %ymm0
; AVX1-NEXT: vmovups 32(%rdi), %ymm1
; AVX1-NEXT: vmovups 64(%rdi), %ymm2
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = zero,zero,xmm2[2],xmm3[1]
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm6
; AVX1-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm6[2],xmm5[3]
; AVX1-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[0,3,2,1]
; AVX1-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,3,2,3]
; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5
; AVX1-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3]
; AVX1-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1],xmm3[2],xmm2[3]
; AVX1-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[0,0,3,2]
; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
; AVX1-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm7
; AVX1-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0],xmm6[1,2],xmm7[3]
; AVX1-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[1,0,3,2]
; AVX1-NEXT: vmovshdup {{.*#+}} xmm7 = xmm7[1,1,3,3]
; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6
; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6,7]
; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0,3]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,0,3]
; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
; AVX1-NEXT: vmovupd %ymm4, (%rsi)
; AVX1-NEXT: vmovups %ymm5, (%rdx)
; AVX1-NEXT: vmovups %ymm0, (%rcx)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: interleave_24i32_out:
; AVX2: # BB#0:
; AVX2-NEXT: vmovups (%rdi), %ymm0
; AVX2-NEXT: vmovups 32(%rdi), %ymm1
; AVX2-NEXT: vmovups 64(%rdi), %ymm2
; AVX2-NEXT: vmovaps {{.*#+}} ymm3 = <u,u,u,u,u,u,2,5>
; AVX2-NEXT: vpermps %ymm2, %ymm3, %ymm3
; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
; AVX2-NEXT: vmovaps {{.*#+}} ymm5 = <0,3,6,1,4,7,u,u>
; AVX2-NEXT: vpermps %ymm4, %ymm5, %ymm4
; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
; AVX2-NEXT: vmovaps {{.*#+}} ymm4 = <u,u,u,u,u,0,3,6>
; AVX2-NEXT: vpermps %ymm2, %ymm4, %ymm4
; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
; AVX2-NEXT: vmovaps {{.*#+}} ymm6 = <1,4,7,2,5,u,u,u>
; AVX2-NEXT: vpermps %ymm5, %ymm6, %ymm5
; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = <2,5,0,3,6,u,u,u>
; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[0,1,0,3,4,5,4,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX2-NEXT: vmovups %ymm3, (%rsi)
; AVX2-NEXT: vmovups %ymm4, (%rdx)
; AVX2-NEXT: vmovups %ymm0, (%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; XOP-LABEL: interleave_24i32_out:
; XOP: # BB#0:
; XOP-NEXT: vmovups (%rdi), %ymm0
; XOP-NEXT: vmovups 32(%rdi), %ymm1
; XOP-NEXT: vmovups 64(%rdi), %ymm2
; XOP-NEXT: vextractf128 $1, %ymm2, %xmm3
; XOP-NEXT: vinsertps {{.*#+}} xmm4 = zero,zero,xmm2[2],xmm3[1]
; XOP-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
; XOP-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
; XOP-NEXT: vextractf128 $1, %ymm5, %xmm6
; XOP-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm6[2],xmm5[3]
; XOP-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[0,3,2,1]
; XOP-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,3,2,3]
; XOP-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5
; XOP-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3]
; XOP-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1],xmm3[2],xmm2[3]
; XOP-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[0,0,3,2]
; XOP-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
; XOP-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
; XOP-NEXT: vextractf128 $1, %ymm6, %xmm7
; XOP-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0],xmm6[1,2],xmm7[3]
; XOP-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[1,0,3,2]
; XOP-NEXT: vmovshdup {{.*#+}} xmm7 = xmm7[1,1,3,3]
; XOP-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6
; XOP-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6,7]
; XOP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0,3]
; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
; XOP-NEXT: vextractf128 $1, %ymm0, %xmm1
; XOP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; XOP-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,0,3]
; XOP-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
; XOP-NEXT: vmovupd %ymm4, (%rsi)
; XOP-NEXT: vmovups %ymm5, (%rdx)
; XOP-NEXT: vmovups %ymm0, (%rcx)
; XOP-NEXT: vzeroupper
; XOP-NEXT: retq
  %wide.vec = load <24 x i32>, <24 x i32>* %p, align 4
  %s1 = shufflevector <24 x i32> %wide.vec, <24 x i32> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
  %s2 = shufflevector <24 x i32> %wide.vec, <24 x i32> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
  %s3 = shufflevector <24 x i32> %wide.vec, <24 x i32> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
  store <8 x i32> %s1, <8 x i32>* %q1, align 4
  store <8 x i32> %s2, <8 x i32>* %q2, align 4
  store <8 x i32> %s3, <8 x i32>* %q3, align 4
  ret void
}

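; interleave_24i32_in loads three <8 x i32> vectors from %q1, %q2 and %q3 and stores them
; to %p interleaved element-by-element as a single <24 x i32> vector.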
define void @interleave_24i32_in(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2, <8 x i32>* %q3) nounwind {
; SSE2-LABEL: interleave_24i32_in:
; SSE2: # BB#0:
; SSE2-NEXT: movdqu (%rsi), %xmm5
; SSE2-NEXT: movdqu 16(%rsi), %xmm2
; SSE2-NEXT: movdqu (%rdx), %xmm6
; SSE2-NEXT: movdqu 16(%rdx), %xmm1
; SSE2-NEXT: movdqu (%rcx), %xmm7
; SSE2-NEXT: movdqu 16(%rcx), %xmm4
; SSE2-NEXT: movdqa %xmm5, %xmm0
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,2]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,1,0,1]
; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[3,0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0,2]
; SSE2-NEXT: movdqa %xmm7, %xmm3
; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm6[1,0]
; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm6[2,2]
; SSE2-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm5[2],xmm7[3],xmm5[3]
; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm3[3,0]
; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm5[0,2]
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,3,2,2]
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,2,3,3]
; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm5[3,0]
; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm6[0,2]
; SSE2-NEXT: movdqa %xmm2, %xmm6
; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,2,2]
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,1,0,1]
; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm6[3,0]
; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm7[0,2]
; SSE2-NEXT: movdqa %xmm4, %xmm7
; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm1[1,0]
; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm1[2,2]
; SSE2-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm2[2],xmm4[3],xmm2[3]
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm7[3,0]
; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm2[0,2]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,3,2,2]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0]
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2]
; SSE2-NEXT: movups %xmm2, 80(%rdi)
; SSE2-NEXT: movups %xmm7, 64(%rdi)
; SSE2-NEXT: movups %xmm6, 48(%rdi)
; SSE2-NEXT: movups %xmm5, 32(%rdi)
; SSE2-NEXT: movups %xmm3, 16(%rdi)
; SSE2-NEXT: movups %xmm0, (%rdi)
; SSE2-NEXT: retq
;
; SSE42-LABEL: interleave_24i32_in:
; SSE42: # BB#0:
; SSE42-NEXT: movdqu (%rsi), %xmm5
; SSE42-NEXT: movdqu 16(%rsi), %xmm2
; SSE42-NEXT: movdqu (%rdx), %xmm6
; SSE42-NEXT: movdqu 16(%rdx), %xmm1
; SSE42-NEXT: movdqu (%rcx), %xmm7
; SSE42-NEXT: movdqu 16(%rcx), %xmm4
; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,1,1]
; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,1,0,1]
; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,3],xmm3[4,5,6,7]
; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,1,0,1]
; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5],xmm3[6,7]
; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,2,2]
; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm7[2,3],xmm3[4,5,6,7]
; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm5[4,5],xmm3[6,7]
; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,0,1]
; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,3,2,3]
; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1],xmm5[2,3],xmm7[4,5,6,7]
; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm6[2,2,3,3]
; SSE42-NEXT: pblendw {{.*#+}} xmm5 = xmm7[0,1,2,3],xmm5[4,5],xmm7[6,7]
; SSE42-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,1,1]
; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm2[0,1,0,1]
; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1],xmm6[2,3],xmm7[4,5,6,7]
; SSE42-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,1,0,1]
; SSE42-NEXT: pblendw {{.*#+}} xmm6 = xmm7[0,1,2,3],xmm6[4,5],xmm7[6,7]
; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,2,2]
; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1],xmm4[2,3],xmm7[4,5,6,7]
; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm2[4,5],xmm7[6,7]
; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
; SSE42-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3],xmm4[4,5,6,7]
; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm1[4,5],xmm4[6,7]
; SSE42-NEXT: movdqu %xmm1, 80(%rdi)
; SSE42-NEXT: movdqu %xmm7, 64(%rdi)
; SSE42-NEXT: movdqu %xmm6, 48(%rdi)
; SSE42-NEXT: movdqu %xmm5, 32(%rdi)
; SSE42-NEXT: movdqu %xmm3, 16(%rdi)
; SSE42-NEXT: movdqu %xmm0, (%rdi)
; SSE42-NEXT: retq
;
; AVX1-LABEL: interleave_24i32_in:
; AVX1: # BB#0:
; AVX1-NEXT: vmovups (%rsi), %ymm0
; AVX1-NEXT: vmovups (%rdx), %ymm1
; AVX1-NEXT: vmovupd (%rcx), %ymm2
; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,0],xmm1[2,0]
; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm1[1,1],xmm3[0,2]
; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm1[0,0],xmm0[0,0]
; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[2,1]
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = xmm2[0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm4, %ymm4
; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
; AVX1-NEXT: vshufps {{.*#+}} xmm6 = xmm5[3,0],xmm4[3,0]
; AVX1-NEXT: vshufps {{.*#+}} xmm6 = xmm4[2,1],xmm6[0,2]
; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,0],xmm5[1,0]
; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,0],xmm5[2,2]
; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4
; AVX1-NEXT: vpermilpd {{.*#+}} ymm5 = ymm0[1,1,3,3]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,2,3]
; AVX1-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7]
; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,2]
; AVX1-NEXT: vpermilpd {{.*#+}} ymm2 = ymm2[1,1,2,2]
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7]
; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7]
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
; AVX1-NEXT: vmovupd %ymm0, 32(%rdi)
; AVX1-NEXT: vmovupd %ymm4, 64(%rdi)
; AVX1-NEXT: vmovups %ymm3, (%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: interleave_24i32_in:
; AVX2: # BB#0:
; AVX2-NEXT: vmovups (%rsi), %ymm0
; AVX2-NEXT: vmovups (%rdx), %ymm1
; AVX2-NEXT: vmovups (%rcx), %ymm2
; AVX2-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[1,0,2,2]
; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,1]
; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[0,0,2,1]
; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7]
; AVX2-NEXT: vbroadcastsd %xmm2, %ymm4
; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm2[2,1,3,3]
; AVX2-NEXT: vpermilps {{.*#+}} ymm5 = ymm1[1,2,3,3,5,6,7,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,2,2,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6],ymm4[7]
; AVX2-NEXT: vbroadcastsd 24(%rsi), %ymm5
; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2]
; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[1,1,2,2]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7]
; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
; AVX2-NEXT: vmovups %ymm0, 32(%rdi)
; AVX2-NEXT: vmovups %ymm4, 64(%rdi)
; AVX2-NEXT: vmovups %ymm3, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; XOP-LABEL: interleave_24i32_in:
; XOP: # BB#0:
; XOP-NEXT: vmovups (%rsi), %ymm0
; XOP-NEXT: vmovups (%rdx), %ymm1
; XOP-NEXT: vmovupd (%rcx), %ymm2
; XOP-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,0],xmm1[2,0]
; XOP-NEXT: vshufps {{.*#+}} xmm3 = xmm1[1,1],xmm3[0,2]
; XOP-NEXT: vshufps {{.*#+}} xmm4 = xmm1[0,0],xmm0[0,0]
; XOP-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[2,1]
; XOP-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
; XOP-NEXT: vmovddup {{.*#+}} xmm4 = xmm2[0,0]
; XOP-NEXT: vinsertf128 $1, %xmm4, %ymm4, %ymm4
; XOP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
; XOP-NEXT: vextractf128 $1, %ymm2, %xmm4
; XOP-NEXT: vextractf128 $1, %ymm1, %xmm5
; XOP-NEXT: vshufps {{.*#+}} xmm6 = xmm5[3,0],xmm4[3,0]
; XOP-NEXT: vshufps {{.*#+}} xmm6 = xmm4[2,1],xmm6[0,2]
; XOP-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,0],xmm5[1,0]
; XOP-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,0],xmm5[2,2]
; XOP-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4
; XOP-NEXT: vpermilpd {{.*#+}} ymm5 = ymm0[1,1,3,3]
; XOP-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,2,3]
; XOP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7]
; XOP-NEXT: vpermil2ps {{.*#+}} ymm0 = ymm2[2],ymm0[3],ymm2[2,3],ymm0[4],ymm2[5,4],ymm0[5]
; XOP-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7]
; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
; XOP-NEXT: vmovups %ymm0, 32(%rdi)
; XOP-NEXT: vmovupd %ymm4, 64(%rdi)
; XOP-NEXT: vmovups %ymm3, (%rdi)
; XOP-NEXT: vzeroupper
; XOP-NEXT: retq
  %s1 = load <8 x i32>, <8 x i32>* %q1, align 4
  %s2 = load <8 x i32>, <8 x i32>* %q2, align 4
  %s3 = load <8 x i32>, <8 x i32>* %q3, align 4
  %t1 = shufflevector <8 x i32> %s1, <8 x i32> %s2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %t2 = shufflevector <8 x i32> %s3, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %interleaved = shufflevector <16 x i32> %t1, <16 x i32> %t2, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
  store <24 x i32> %interleaved, <24 x i32>* %p, align 4
  ret void
}

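; wrongorder splats element 0 of %A across all eight lanes, round-trips the splat through
; memory at %P, then returns lanes 2 and 0; every lane holds the same value, so the result
; is just the low element of %A duplicated.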
define <2 x double> @wrongorder(<4 x double> %A, <8 x double>* %P) #0 {
; SSE2-LABEL: wrongorder:
; SSE2: # BB#0:
; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
; SSE2-NEXT: movaps %xmm0, 48(%rdi)
; SSE2-NEXT: movaps %xmm0, 32(%rdi)
; SSE2-NEXT: movaps %xmm0, 16(%rdi)
; SSE2-NEXT: movaps %xmm0, (%rdi)
; SSE2-NEXT: retq
;
; SSE42-LABEL: wrongorder:
; SSE42: # BB#0:
; SSE42-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSE42-NEXT: movapd %xmm0, 48(%rdi)
; SSE42-NEXT: movapd %xmm0, 32(%rdi)
; SSE42-NEXT: movapd %xmm0, 16(%rdi)
; SSE42-NEXT: movapd %xmm0, (%rdi)
; SSE42-NEXT: retq
;
; AVX1-LABEL: wrongorder:
; AVX1: # BB#0:
; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1
; AVX1-NEXT: vmovaps %ymm1, 32(%rdi)
; AVX1-NEXT: vmovaps %ymm1, (%rdi)
; AVX1-NEXT: # kill: %xmm0<def> %xmm0<kill> %ymm0<kill>
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: wrongorder:
; AVX2: # BB#0:
; AVX2-NEXT: vbroadcastsd %xmm0, %ymm1
; AVX2-NEXT: vmovapd %ymm1, 32(%rdi)
; AVX2-NEXT: vmovapd %ymm1, (%rdi)
; AVX2-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; XOP-LABEL: wrongorder:
; XOP: # BB#0:
; XOP-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; XOP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1
; XOP-NEXT: vmovaps %ymm1, 32(%rdi)
; XOP-NEXT: vmovaps %ymm1, (%rdi)
; XOP-NEXT: # kill: %xmm0<def> %xmm0<kill> %ymm0<kill>
; XOP-NEXT: vzeroupper
; XOP-NEXT: retq
  %shuffle = shufflevector <4 x double> %A, <4 x double> %A, <8 x i32> zeroinitializer
  store <8 x double> %shuffle, <8 x double>* %P, align 64
  %m2 = load <8 x double>, <8 x double>* %P, align 64
  store <8 x double> %m2, <8 x double>* %P, align 64
  %m3 = load <8 x double>, <8 x double>* %P, align 64
  %m4 = shufflevector <8 x double> %m3, <8 x double> undef, <2 x i32> <i32 2, i32 0>
  ret <2 x double> %m4
}