; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_mem_shuffle
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse3 | FileCheck %s --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
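
; Interleave eight v8i16 inputs into one v64i16 result: adjacent inputs are
; interleaved at i16 granularity, the pair results at i32 granularity, and
; the two halves once more at i16 granularity.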
define <64 x i16> @interleave8x8(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c, <8 x i16> %d, <8 x i16> %e, <8 x i16> %f, <8 x i16> %h, <8 x i16> %g) {
; SSE-LABEL: interleave8x8:
; SSE-NEXT: movq %rdi, %rax
; SSE-NEXT: movdqa %xmm0, %xmm8
; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3]
; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT: movdqa %xmm2, %xmm9
; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1],xmm9[2],xmm3[2],xmm9[3],xmm3[3]
; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
; SSE-NEXT: movdqa %xmm0, %xmm3
; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE-NEXT: movdqa %xmm8, %xmm1
; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm9[2],xmm1[3],xmm9[3]
; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1]
; SSE-NEXT: movdqa %xmm4, %xmm2
; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
; SSE-NEXT: movdqa %xmm7, %xmm5
; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
; SSE-NEXT: movdqa %xmm4, %xmm6
; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm7[2],xmm6[3],xmm7[3]
; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1]
; SSE-NEXT: movdqa %xmm2, %xmm7
; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm5[2],xmm7[3],xmm5[3]
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
; SSE-NEXT: movdqa %xmm8, %xmm5
; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7]
; SSE-NEXT: movdqa %xmm1, %xmm2
; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3]
; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7]
; SSE-NEXT: movdqa %xmm0, %xmm7
; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3]
; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
; SSE-NEXT: movdqa %xmm3, %xmm4
; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7]
; SSE-NEXT: movdqa %xmm3, 112(%rdi)
; SSE-NEXT: movdqa %xmm4, 96(%rdi)
; SSE-NEXT: movdqa %xmm0, 80(%rdi)
; SSE-NEXT: movdqa %xmm7, 64(%rdi)
; SSE-NEXT: movdqa %xmm1, 48(%rdi)
; SSE-NEXT: movdqa %xmm2, 32(%rdi)
; SSE-NEXT: movdqa %xmm8, 16(%rdi)
; SSE-NEXT: movdqa %xmm5, (%rdi)
; AVX1-LABEL: interleave8x8:
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm8[2],xmm1[2],xmm8[3],xmm1[3]
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm8[0],xmm1[0],xmm8[1],xmm1[1]
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm4[2],xmm6[2],xmm4[3],xmm6[3]
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1]
; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm1[2],xmm5[2],xmm1[3],xmm5[3]
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7]
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7]
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3]
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
; AVX2-LABEL: interleave8x8:
; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm0
; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm0[2,3],ymm2[2,3]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1],ymm2[0,1]
; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2
; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm4 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7]
; AVX2-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm1[2,3],ymm4[2,3]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[0,1],ymm4[0,1]
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[0,1],ymm4[0,1]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm4[2,3]
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15]
; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm3[0,1],ymm4[0,1]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm4[2,3]
%ab = shufflevector <8 x i16> %a, <8 x i16> %b, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
%cd = shufflevector <8 x i16> %c, <8 x i16> %d, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
%ab32 = bitcast <16 x i16> %ab to <8 x i32>
%cd32 = bitcast <16 x i16> %cd to <8 x i32>
%abcd32 = shufflevector <8 x i32> %ab32, <8 x i32> %cd32, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
%abcd = bitcast <16 x i32> %abcd32 to <32 x i16>

%ef = shufflevector <8 x i16> %e, <8 x i16> %f, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
%gh = shufflevector <8 x i16> %g, <8 x i16> %h, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
%ef32 = bitcast <16 x i16> %ef to <8 x i32>
%gh32 = bitcast <16 x i16> %gh to <8 x i32>
%efgh32 = shufflevector <8 x i32> %ef32, <8 x i32> %gh32, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
%efgh = bitcast <16 x i32> %efgh32 to <32 x i16>

%result = shufflevector <32 x i16> %abcd, <32 x i16> %efgh, <64 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
ret <64 x i16> %result
}
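
; Interleave two v4f64 vectors into one v8f64 result.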
define <8 x double> @interleave2x4f64(<4 x double> %a, <4 x double> %b) {
; SSE-LABEL: interleave2x4f64:
; SSE-NEXT: movaps %xmm0, %xmm4
; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1]
; SSE-NEXT: movaps %xmm1, %xmm2
; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; SSE-NEXT: movhlps {{.*#+}} xmm3 = xmm1[1],xmm3[1]
; SSE-NEXT: movaps %xmm4, %xmm1
; AVX1-LABEL: interleave2x4f64:
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,2,3]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3]
; AVX1-NEXT: vshufpd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[3],ymm2[3]
; AVX1-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm0[1],xmm1[1]
; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT: vmovapd %ymm2, %ymm1
; AVX2-LABEL: interleave2x4f64:
; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm1[0,0,2,1]
; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[0,1,1,3]
; AVX2-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3]
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,2,3]
; AVX2-NEXT: vshufpd {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[3],ymm1[3]
; AVX2-NEXT: vmovapd %ymm2, %ymm0
%result = shufflevector <4 x double> %a, <4 x double> %b, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
ret <8 x double> %result
}
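
; Same interleave as above with i64 elements; the AVX2 lowering uses vblendps
; instead of vblendpd.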
define <8 x i64> @interleave2x4i64(<4 x i64> %a, <4 x i64> %b) {
; SSE-LABEL: interleave2x4i64:
; SSE-NEXT: movaps %xmm1, %xmm4
; SSE-NEXT: movaps %xmm0, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
; SSE-NEXT: movaps %xmm4, %xmm2
; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
; SSE-NEXT: movaps %xmm4, %xmm3
; AVX1-LABEL: interleave2x4i64:
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,2,3]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3]
; AVX1-NEXT: vshufpd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[3],ymm2[3]
; AVX1-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm0[1],xmm1[1]
; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT: vmovapd %ymm2, %ymm1
; AVX2-LABEL: interleave2x4i64:
; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm1[0,0,2,1]
; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[0,1,1,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX2-NEXT: vmovaps %ymm2, %ymm0
%result = shufflevector <4 x i64> %a, <4 x i64> %b, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
ret <8 x i64> %result
}
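
; Interleave two v8f32 vectors into one v16f32 result.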
define <16 x float> @interleave2x8f32(<8 x float> %a, <8 x float> %b) {
; SSE-LABEL: interleave2x8f32:
; SSE-NEXT: movaps %xmm1, %xmm4
; SSE-NEXT: movaps %xmm0, %xmm1
; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; SSE-NEXT: movaps %xmm4, %xmm2
; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; SSE-NEXT: movaps %xmm4, %xmm3
; AVX1-LABEL: interleave2x8f32:
; AVX1-NEXT: vunpckhps {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vunpcklps {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vunpckhps {{.*#+}} xmm3 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1
; AVX1-NEXT: vmovaps %ymm2, %ymm0
; AVX2-LABEL: interleave2x8f32:
; AVX2-NEXT: vunpckhps {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX2-NEXT: vunpcklps {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[0,1],ymm2[0,1]
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3]
%result = shufflevector <8 x float> %a, <8 x float> %b, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
ret <16 x float> %result
}
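
; Interleave two v8i32 vectors into one v16i32 result; the lowering matches
; the f32 case above.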
define <16 x i32> @interleave2x8i32(<8 x i32> %a, <8 x i32> %b) {
; SSE-LABEL: interleave2x8i32:
; SSE-NEXT: movaps %xmm1, %xmm4
; SSE-NEXT: movaps %xmm0, %xmm1
; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; SSE-NEXT: movaps %xmm4, %xmm2
; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; SSE-NEXT: movaps %xmm4, %xmm3
; AVX1-LABEL: interleave2x8i32:
; AVX1-NEXT: vunpckhps {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vunpcklps {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vunpckhps {{.*#+}} xmm3 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1
; AVX1-NEXT: vmovaps %ymm2, %ymm0
; AVX2-LABEL: interleave2x8i32:
; AVX2-NEXT: vunpckhps {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX2-NEXT: vunpcklps {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[0,1],ymm2[0,1]
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3]
%result = shufflevector <8 x i32> %a, <8 x i32> %b, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
ret <16 x i32> %result
}
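
; Interleave two v16i16 vectors into one v32i16 result.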
define <32 x i16> @interleave2x16i16(<16 x i16> %a, <16 x i16> %b) {
; SSE-LABEL: interleave2x16i16:
; SSE-NEXT: movdqa %xmm1, %xmm4
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE-NEXT: movdqa %xmm4, %xmm2
; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
; SSE-NEXT: movdqa %xmm4, %xmm3
; AVX1-LABEL: interleave2x16i16:
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1
; AVX1-NEXT: vmovaps %ymm2, %ymm0
; AVX2-LABEL: interleave2x16i16:
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[0,1],ymm2[0,1]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3]
%result = shufflevector <16 x i16> %a, <16 x i16> %b, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
ret <32 x i16> %result
}
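
; Interleave two v32i16 vectors into one v64i16 result; on SSE the result is
; too wide for registers and is written through the pointer in %rdi.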
define <64 x i16> @interleave2x32i16(<32 x i16> %a, <32 x i16> %b) {
; SSE-LABEL: interleave2x32i16:
; SSE-NEXT: movq %rdi, %rax
; SSE-NEXT: movdqa %xmm0, %xmm8
; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3]
; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
; SSE-NEXT: movdqa %xmm1, %xmm4
; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
; SSE-NEXT: movdqa %xmm2, %xmm5
; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
; SSE-NEXT: movdqa %xmm3, %xmm6
; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7]
; SSE-NEXT: movdqa %xmm3, 112(%rdi)
; SSE-NEXT: movdqa %xmm6, 96(%rdi)
; SSE-NEXT: movdqa %xmm2, 80(%rdi)
; SSE-NEXT: movdqa %xmm5, 64(%rdi)
; SSE-NEXT: movdqa %xmm1, 48(%rdi)
; SSE-NEXT: movdqa %xmm4, 32(%rdi)
; SSE-NEXT: movdqa %xmm0, 16(%rdi)
; SSE-NEXT: movdqa %xmm8, (%rdi)
; AVX1-LABEL: interleave2x32i16:
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm2
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm0
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
; AVX1-NEXT: vmovaps %ymm4, %ymm0
; AVX1-NEXT: vmovaps %ymm5, %ymm1
; AVX2-LABEL: interleave2x32i16:
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15]
; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[0,1],ymm4[0,1]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm2[2,3],ymm4[2,3]
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15]
; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm1[0,1],ymm5[0,1]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm1[2,3],ymm5[2,3]
; AVX2-NEXT: vmovdqa %ymm4, %ymm1
%result = shufflevector <32 x i16> %a, <32 x i16> %b, <64 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
ret <64 x i16> %result
}
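
; Interleave two v32i8 vectors into one v64i8 result.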
define <64 x i8> @interleave2x32i8(<32 x i8> %a, <32 x i8> %b) {
; SSE-LABEL: interleave2x32i8:
; SSE-NEXT: movdqa %xmm1, %xmm4
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
; SSE-NEXT: movdqa %xmm4, %xmm2
; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15]
; SSE-NEXT: movdqa %xmm4, %xmm3
; AVX1-LABEL: interleave2x32i8:
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1
; AVX1-NEXT: vmovaps %ymm2, %ymm0
; AVX2-LABEL: interleave2x32i8:
; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[0,1],ymm2[0,1]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3]
%result = shufflevector <32 x i8> %a, <32 x i8> %b, <64 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
ret <64 x i8> %result
}
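
; splat2_*: load a vector, duplicate each element in place, and store the
; result at twice the width.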
define void @splat2_i8(ptr %s, ptr %d) {
; SSE-LABEL: splat2_i8:
; SSE-NEXT: movdqu (%rdi), %xmm0
; SSE-NEXT: movdqu 16(%rdi), %xmm1
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE-NEXT: movdqa %xmm1, %xmm3
; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE-NEXT: movdqu %xmm1, 48(%rsi)
; SSE-NEXT: movdqu %xmm3, 32(%rsi)
; SSE-NEXT: movdqu %xmm0, 16(%rsi)
; SSE-NEXT: movdqu %xmm2, (%rsi)
; AVX1-LABEL: splat2_i8:
; AVX1-NEXT: vmovdqu (%rdi), %xmm0
; AVX1-NEXT: vmovdqu 16(%rdi), %xmm1
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vmovdqu %xmm1, 48(%rsi)
; AVX1-NEXT: vmovdqu %xmm3, 32(%rsi)
; AVX1-NEXT: vmovdqu %xmm0, 16(%rsi)
; AVX1-NEXT: vmovdqu %xmm2, (%rsi)
; AVX2-LABEL: splat2_i8:
; AVX2-NEXT: vpermq $216, (%rdi), %ymm0 # ymm0 = mem[0,2,1,3]
; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX2-NEXT: vmovdqu %ymm0, 32(%rsi)
; AVX2-NEXT: vmovdqu %ymm1, (%rsi)
; AVX2-NEXT: vzeroupper
%ld32 = load <32 x i8>, ptr %s, align 1
%cat = shufflevector <32 x i8> %ld32, <32 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
%cat2 = shufflevector <64 x i8> %cat, <64 x i8> undef, <64 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
store <64 x i8> %cat2, ptr %d, align 1
ret void
}
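
; Duplicate each i16 element of a v16i16 load.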
define void @splat2_i16(ptr %s, ptr %d) {
; SSE-LABEL: splat2_i16:
; SSE-NEXT: movdqu (%rdi), %xmm0
; SSE-NEXT: movdqu 16(%rdi), %xmm1
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; SSE-NEXT: movdqa %xmm1, %xmm3
; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE-NEXT: movdqu %xmm1, 48(%rsi)
; SSE-NEXT: movdqu %xmm3, 32(%rsi)
; SSE-NEXT: movdqu %xmm0, 16(%rsi)
; SSE-NEXT: movdqu %xmm2, (%rsi)
; AVX1-LABEL: splat2_i16:
; AVX1-NEXT: vmovdqu (%rdi), %xmm0
; AVX1-NEXT: vmovdqu 16(%rdi), %xmm1
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0,0,1,1,2,2,3,3]
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0,0,1,1,2,2,3,3]
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; AVX1-NEXT: vmovdqu %xmm1, 48(%rsi)
; AVX1-NEXT: vmovdqu %xmm3, 32(%rsi)
; AVX1-NEXT: vmovdqu %xmm0, 16(%rsi)
; AVX1-NEXT: vmovdqu %xmm2, (%rsi)
; AVX2-LABEL: splat2_i16:
; AVX2-NEXT: vpermq $216, (%rdi), %ymm0 # ymm0 = mem[0,2,1,3]
; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
; AVX2-NEXT: vmovdqu %ymm0, 32(%rsi)
; AVX2-NEXT: vmovdqu %ymm1, (%rsi)
; AVX2-NEXT: vzeroupper
%ld32 = load <16 x i16>, ptr %s, align 1
%cat = shufflevector <16 x i16> %ld32, <16 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%cat2 = shufflevector <32 x i16> %cat, <32 x i16> undef, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
store <32 x i16> %cat2, ptr %d, align 1
ret void
}
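
; Duplicate each i32 element of a v8i32 load.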
define void @splat2_i32(ptr %s, ptr %d) {
; SSE-LABEL: splat2_i32:
; SSE-NEXT: movdqu (%rdi), %xmm0
; SSE-NEXT: movdqu 16(%rdi), %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
; SSE-NEXT: movdqu %xmm1, 48(%rsi)
; SSE-NEXT: movdqu %xmm3, 32(%rsi)
; SSE-NEXT: movdqu %xmm0, 16(%rsi)
; SSE-NEXT: movdqu %xmm2, (%rsi)
; AVX1-LABEL: splat2_i32:
; AVX1-NEXT: vmovups (%rdi), %xmm0
; AVX1-NEXT: vmovups 16(%rdi), %xmm1
; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,0,1,1]
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm1[0,0,1,1]
; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,2,3,3]
; AVX1-NEXT: vmovups %xmm1, 48(%rsi)
; AVX1-NEXT: vmovups %xmm3, 32(%rsi)
; AVX1-NEXT: vmovups %xmm0, 16(%rsi)
; AVX1-NEXT: vmovups %xmm2, (%rsi)
; AVX2-LABEL: splat2_i32:
; AVX2-NEXT: vpermpd $216, (%rdi), %ymm0 # ymm0 = mem[0,2,1,3]
; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm0[0,0,1,1,4,4,5,5]
; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7]
; AVX2-NEXT: vmovups %ymm0, 32(%rsi)
; AVX2-NEXT: vmovups %ymm1, (%rsi)
; AVX2-NEXT: vzeroupper
%ld32 = load <8 x i32>, ptr %s, align 1
%cat = shufflevector <8 x i32> %ld32, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%cat2 = shufflevector <16 x i32> %cat, <16 x i32> undef, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
store <16 x i32> %cat2, ptr %d, align 1
ret void
}
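
; Duplicate each i64 element of a v4i64 load.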
define void @splat2_i64(ptr %s, ptr %d) {
; SSE-LABEL: splat2_i64:
; SSE-NEXT: movdqu (%rdi), %xmm0
; SSE-NEXT: movdqu 16(%rdi), %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,1,0,1]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE-NEXT: movdqu %xmm1, 48(%rsi)
; SSE-NEXT: movdqu %xmm3, 32(%rsi)
; SSE-NEXT: movdqu %xmm0, 16(%rsi)
; SSE-NEXT: movdqu %xmm2, (%rsi)
; AVX1-LABEL: splat2_i64:
; AVX1-NEXT: vperm2f128 $51, (%rdi), %ymm0, %ymm0 # ymm0 = mem[2,3,2,3]
; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0,0,3,3]
; AVX1-NEXT: vbroadcastf128 (%rdi), %ymm1 # ymm1 = mem[0,1,0,1]
; AVX1-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0,0,3,3]
; AVX1-NEXT: vmovupd %ymm0, 32(%rsi)
; AVX1-NEXT: vmovupd %ymm1, (%rsi)
; AVX1-NEXT: vzeroupper
; AVX2-LABEL: splat2_i64:
; AVX2-NEXT: vmovups (%rdi), %ymm0
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,0,1,1]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,3,3]
; AVX2-NEXT: vmovups %ymm0, 32(%rsi)
; AVX2-NEXT: vmovups %ymm1, (%rsi)
; AVX2-NEXT: vzeroupper
%ld32 = load <4 x i64>, ptr %s, align 1
%cat = shufflevector <4 x i64> %ld32, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
%cat2 = shufflevector <8 x i64> %cat, <8 x i64> undef, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
store <8 x i64> %cat2, ptr %d, align 1
ret void
}