; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST-ALL
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST-PERLANE
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512

; These patterns are produced by LoopVectorizer for interleaved stores.
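;
; For reference only (plain comment, not a FileCheck directive): each @vfN below
; loads four N x i16 vectors a, b, c, d, concatenates them, and stores them
; interleaved with stride 4, i.e. <a0,b0,c0,d0, a1,b1,c1,d1, ...>. A minimal
; sketch of the shuffle for the <2 x i16> case in @vf2:
;
;   %concat0123      = <a0, a1, b0, b1, c0, c1, d0, d1>
;   %interleaved.vec = shuffle %concat0123 by <0, 2, 4, 6, 1, 3, 5, 7>
;                    = <a0, b0, c0, d0, a1, b1, c1, d1>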
define void @vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %out.vec) nounwind {
; SSE-LABEL: vf2:
; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm0
; SSE-NEXT: movdqa (%rdx), %xmm1
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,7]
; SSE-NEXT: movdqa %xmm0, (%r8)
; SSE-NEXT: retq
;
; AVX-LABEL: vf2:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vmovdqa (%rdx), %xmm1
; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15]
; AVX-NEXT: vmovdqa %xmm0, (%r8)
; AVX-NEXT: retq
;
; AVX512-LABEL: vf2:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vmovdqa (%rdx), %xmm1
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2,8,10,1,3,9,11]
; AVX512-NEXT: vpermi2w %xmm1, %xmm0, %xmm2
; AVX512-NEXT: vmovdqa %xmm2, (%r8)
; AVX512-NEXT: retq
  %in.vec0 = load <2 x i16>, ptr %in.vecptr0, align 32
  %in.vec1 = load <2 x i16>, ptr %in.vecptr1, align 32
  %in.vec2 = load <2 x i16>, ptr %in.vecptr2, align 32
  %in.vec3 = load <2 x i16>, ptr %in.vecptr3, align 32

  %concat01 = shufflevector <2 x i16> %in.vec0, <2 x i16> %in.vec1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %concat23 = shufflevector <2 x i16> %in.vec2, <2 x i16> %in.vec3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %concat0123 = shufflevector <4 x i16> %concat01, <4 x i16> %concat23, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %interleaved.vec = shufflevector <8 x i16> %concat0123, <8 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>

  store <8 x i16> %interleaved.vec, ptr %out.vec, align 32
  ret void
}
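
; vf4: interleaves four <4 x i16> inputs from %in.vecptr0..3 into a single
; <16 x i16> stride-4 store to %out.vec.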
define void @vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %out.vec) nounwind {
; SSE-LABEL: vf4:
; SSE: # %bb.0:
64 ; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
65 ; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
66 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
67 ; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
68 ; SSE-NEXT: movq {{.*#+}} xmm2 = mem[0],zero
69 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
70 ; SSE-NEXT: movdqa %xmm0, %xmm2
71 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
72 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
73 ; SSE-NEXT: movdqa %xmm0, 16(%r8)
; SSE-NEXT: movdqa %xmm2, (%r8)
; SSE-NEXT: retq
;
; AVX1-LABEL: vf4:
; AVX1: # %bb.0:
79 ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
80 ; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
81 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0]
82 ; AVX1-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
83 ; AVX1-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
84 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm4[0],xmm3[0]
85 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
86 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
87 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
88 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[3,1,2,3]
89 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,3,1,4,5,6,7]
90 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
91 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,3,1,4,5,6,7]
92 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
93 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
94 ; AVX1-NEXT: vmovaps %ymm0, (%r8)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: vf4:
; AVX2-SLOW: # %bb.0:
100 ; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
101 ; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
102 ; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
103 ; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
104 ; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
105 ; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
106 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
107 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u,20,21,28,29,u,u,u,u,22,23,30,31]
108 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
109 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,0,1,8,9,u,u,u,u,2,3,10,11,20,21,28,29,u,u,u,u,22,23,30,31,u,u,u,u]
110 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6],ymm1[7]
111 ; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%r8)
112 ; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
115 ; AVX2-FAST-ALL-LABEL: vf4:
116 ; AVX2-FAST-ALL: # %bb.0:
117 ; AVX2-FAST-ALL-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
118 ; AVX2-FAST-ALL-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
119 ; AVX2-FAST-ALL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
120 ; AVX2-FAST-ALL-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
121 ; AVX2-FAST-ALL-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
122 ; AVX2-FAST-ALL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
123 ; AVX2-FAST-ALL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
124 ; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,1,3,5,7]
125 ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0
126 ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,2,3,6,7,10,11,14,15,16,17,20,21,24,25,28,29,18,19,22,23,26,27,30,31]
127 ; AVX2-FAST-ALL-NEXT: vmovdqa %ymm0, (%r8)
128 ; AVX2-FAST-ALL-NEXT: vzeroupper
; AVX2-FAST-ALL-NEXT: retq
;
131 ; AVX2-FAST-PERLANE-LABEL: vf4:
132 ; AVX2-FAST-PERLANE: # %bb.0:
133 ; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
134 ; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
135 ; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
136 ; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
137 ; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
138 ; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
139 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
140 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u,20,21,28,29,u,u,u,u,22,23,30,31]
141 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
142 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,0,1,8,9,u,u,u,u,2,3,10,11,20,21,28,29,u,u,u,u,22,23,30,31,u,u,u,u]
143 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6],ymm1[7]
144 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%r8)
145 ; AVX2-FAST-PERLANE-NEXT: vzeroupper
; AVX2-FAST-PERLANE-NEXT: retq
;
; AVX512-LABEL: vf4:
; AVX512: # %bb.0:
150 ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
151 ; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
152 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
153 ; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
154 ; AVX512-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
155 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
156 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
157 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15]
158 ; AVX512-NEXT: vpermw %ymm0, %ymm1, %ymm0
159 ; AVX512-NEXT: vmovdqa %ymm0, (%r8)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %in.vec0 = load <4 x i16>, ptr %in.vecptr0, align 32
  %in.vec1 = load <4 x i16>, ptr %in.vecptr1, align 32
  %in.vec2 = load <4 x i16>, ptr %in.vecptr2, align 32
  %in.vec3 = load <4 x i16>, ptr %in.vecptr3, align 32

  %concat01 = shufflevector <4 x i16> %in.vec0, <4 x i16> %in.vec1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %concat23 = shufflevector <4 x i16> %in.vec2, <4 x i16> %in.vec3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %concat0123 = shufflevector <8 x i16> %concat01, <8 x i16> %concat23, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %interleaved.vec = shufflevector <16 x i16> %concat0123, <16 x i16> poison, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>

  store <16 x i16> %interleaved.vec, ptr %out.vec, align 32
  ret void
}
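
; vf8: interleaves four <8 x i16> inputs from %in.vecptr0..3 into a single
; <32 x i16> stride-4 store to %out.vec.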
define void @vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %out.vec) nounwind {
; SSE-LABEL: vf8:
; SSE: # %bb.0:
180 ; SSE-NEXT: movdqa (%rdi), %xmm0
181 ; SSE-NEXT: movdqa (%rsi), %xmm1
182 ; SSE-NEXT: movdqa (%rdx), %xmm2
183 ; SSE-NEXT: movdqa (%rcx), %xmm3
184 ; SSE-NEXT: movdqa %xmm2, %xmm4
185 ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
186 ; SSE-NEXT: movdqa %xmm0, %xmm5
187 ; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
188 ; SSE-NEXT: movdqa %xmm5, %xmm6
189 ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
190 ; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
191 ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
192 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
193 ; SSE-NEXT: movdqa %xmm0, %xmm1
194 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
195 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
196 ; SSE-NEXT: movdqa %xmm0, 48(%r8)
197 ; SSE-NEXT: movdqa %xmm1, 32(%r8)
198 ; SSE-NEXT: movdqa %xmm5, 16(%r8)
; SSE-NEXT: movdqa %xmm6, (%r8)
; SSE-NEXT: retq
;
; AVX1-LABEL: vf8:
; AVX1: # %bb.0:
204 ; AVX1-NEXT: vmovdqa (%rdi), %xmm0
205 ; AVX1-NEXT: vmovdqa (%rsi), %xmm1
206 ; AVX1-NEXT: vmovdqa (%rdx), %xmm2
207 ; AVX1-NEXT: vmovdqa (%rcx), %xmm3
208 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
209 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
210 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
211 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
212 ; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4
213 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
214 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
215 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
216 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
217 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
218 ; AVX1-NEXT: vmovaps %ymm0, 32(%r8)
219 ; AVX1-NEXT: vmovaps %ymm4, (%r8)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: vf8:
; AVX2: # %bb.0:
225 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
226 ; AVX2-NEXT: vmovdqa (%rdx), %xmm1
227 ; AVX2-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
228 ; AVX2-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
229 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm1[0,2,0,2]
230 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = <u,u,u,u,0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,4,5,12,13,u,u,u,u,6,7,14,15>
231 ; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2
232 ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,2,0,2]
233 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = <0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,4,5,12,13,u,u,u,u,6,7,14,15,u,u,u,u>
234 ; AVX2-NEXT: vpshufb %ymm5, %ymm4, %ymm4
235 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4],ymm2[5],ymm4[6],ymm2[7]
236 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,1,3]
237 ; AVX2-NEXT: vpshufb %ymm3, %ymm1, %ymm1
238 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,1,3]
239 ; AVX2-NEXT: vpshufb %ymm5, %ymm0, %ymm0
240 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
241 ; AVX2-NEXT: vmovdqa %ymm0, 32(%r8)
242 ; AVX2-NEXT: vmovdqa %ymm2, (%r8)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: vf8:
; AVX512: # %bb.0:
248 ; AVX512-NEXT: vmovdqa (%rdi), %xmm0
249 ; AVX512-NEXT: vmovdqa (%rdx), %xmm1
250 ; AVX512-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
251 ; AVX512-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
252 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
253 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,16,24,1,9,17,25,2,10,18,26,3,11,19,27,4,12,20,28,5,13,21,29,6,14,22,30,7,15,23,31]
254 ; AVX512-NEXT: vpermw %zmm0, %zmm1, %zmm0
255 ; AVX512-NEXT: vmovdqu64 %zmm0, (%r8)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
258 %in.vec0 = load <8 x i16>, ptr %in.vecptr0, align 32
259 %in.vec1 = load <8 x i16>, ptr %in.vecptr1, align 32
260 %in.vec2 = load <8 x i16>, ptr %in.vecptr2, align 32
261 %in.vec3 = load <8 x i16>, ptr %in.vecptr3, align 32
263 %concat01 = shufflevector <8 x i16> %in.vec0, <8 x i16> %in.vec1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
264 %concat23 = shufflevector <8 x i16> %in.vec2, <8 x i16> %in.vec3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
265 %concat0123 = shufflevector <16 x i16> %concat01, <16 x i16> %concat23, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
266 %interleaved.vec = shufflevector <32 x i16> %concat0123, <32 x i16> poison, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
  store <32 x i16> %interleaved.vec, ptr %out.vec, align 32
  ret void
}
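
; vf16: interleaves four <16 x i16> inputs from %in.vecptr0..3 into a single
; <64 x i16> stride-4 store to %out.vec.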
define void @vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %out.vec) nounwind {
; SSE-LABEL: vf16:
; SSE: # %bb.0:
276 ; SSE-NEXT: movdqa (%rdi), %xmm0
277 ; SSE-NEXT: movdqa 16(%rdi), %xmm1
278 ; SSE-NEXT: movdqa (%rsi), %xmm5
279 ; SSE-NEXT: movdqa 16(%rsi), %xmm8
280 ; SSE-NEXT: movdqa (%rdx), %xmm3
281 ; SSE-NEXT: movdqa 16(%rdx), %xmm4
282 ; SSE-NEXT: movdqa (%rcx), %xmm6
283 ; SSE-NEXT: movdqa 16(%rcx), %xmm9
284 ; SSE-NEXT: movdqa %xmm3, %xmm7
285 ; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
286 ; SSE-NEXT: movdqa %xmm0, %xmm2
287 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
288 ; SSE-NEXT: movdqa %xmm2, %xmm10
289 ; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm7[2],xmm10[3],xmm7[3]
290 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1]
291 ; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7]
292 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
293 ; SSE-NEXT: movdqa %xmm0, %xmm5
294 ; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm3[2],xmm5[3],xmm3[3]
295 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
296 ; SSE-NEXT: movdqa %xmm4, %xmm3
297 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3]
298 ; SSE-NEXT: movdqa %xmm1, %xmm6
299 ; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3]
300 ; SSE-NEXT: movdqa %xmm6, %xmm7
301 ; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm3[2],xmm7[3],xmm3[3]
302 ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
303 ; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7]
304 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7]
305 ; SSE-NEXT: movdqa %xmm1, %xmm3
306 ; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
307 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
308 ; SSE-NEXT: movdqa %xmm1, 96(%r8)
309 ; SSE-NEXT: movdqa %xmm3, 112(%r8)
310 ; SSE-NEXT: movdqa %xmm6, 64(%r8)
311 ; SSE-NEXT: movdqa %xmm7, 80(%r8)
312 ; SSE-NEXT: movdqa %xmm0, 32(%r8)
313 ; SSE-NEXT: movdqa %xmm5, 48(%r8)
314 ; SSE-NEXT: movdqa %xmm2, (%r8)
; SSE-NEXT: movdqa %xmm10, 16(%r8)
; SSE-NEXT: retq
;
; AVX1-LABEL: vf16:
; AVX1: # %bb.0:
320 ; AVX1-NEXT: vmovdqa (%rcx), %xmm8
321 ; AVX1-NEXT: vmovdqa 16(%rcx), %xmm5
322 ; AVX1-NEXT: vmovdqa (%rdx), %xmm9
323 ; AVX1-NEXT: vmovdqa 16(%rdx), %xmm6
324 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
325 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,0,1,1]
326 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
327 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm3
328 ; AVX1-NEXT: vmovdqa (%rsi), %xmm2
329 ; AVX1-NEXT: vmovdqa 16(%rsi), %xmm7
330 ; AVX1-NEXT: vmovdqa (%rdi), %xmm4
331 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm0
332 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
333 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm1[0],zero,xmm1[1],zero
334 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
335 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm10, %ymm1
336 ; AVX1-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0],ymm3[1],ymm1[2],ymm3[3],ymm1[4],ymm3[5],ymm1[6],ymm3[7]
337 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
338 ; AVX1-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[0,0,1,1]
339 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
340 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm10, %ymm1
341 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3]
342 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm3[0],zero,xmm3[1],zero
343 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
344 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm10, %ymm3
345 ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2],ymm1[3],ymm3[4],ymm1[5],ymm3[6],ymm1[7]
346 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
347 ; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[0,0,1,1]
348 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
349 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3
350 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7]
351 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero
352 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
353 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0
354 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4],ymm3[5],ymm0[6],ymm3[7]
355 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
356 ; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[0,0,1,1]
357 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
358 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3
359 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
360 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero
361 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
362 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2
363 ; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7]
364 ; AVX1-NEXT: vmovaps %ymm2, (%r8)
365 ; AVX1-NEXT: vmovaps %ymm0, 96(%r8)
366 ; AVX1-NEXT: vmovaps %ymm1, 64(%r8)
367 ; AVX1-NEXT: vmovaps %ymm11, 32(%r8)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: vf16:
; AVX2: # %bb.0:
373 ; AVX2-NEXT: vmovdqa (%rcx), %xmm8
374 ; AVX2-NEXT: vmovdqa 16(%rcx), %xmm5
375 ; AVX2-NEXT: vmovdqa (%rdx), %xmm9
376 ; AVX2-NEXT: vmovdqa 16(%rdx), %xmm6
377 ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
378 ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,0,1,1]
379 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
380 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm3
381 ; AVX2-NEXT: vmovdqa (%rsi), %xmm2
382 ; AVX2-NEXT: vmovdqa 16(%rsi), %xmm7
383 ; AVX2-NEXT: vmovdqa (%rdi), %xmm4
384 ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm0
385 ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
386 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm1[0],zero,xmm1[1],zero
387 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
388 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm10, %ymm1
389 ; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0],ymm3[1],ymm1[2],ymm3[3],ymm1[4],ymm3[5],ymm1[6],ymm3[7]
390 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
391 ; AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[0,0,1,1]
392 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
393 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm10, %ymm1
394 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3]
395 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm3[0],zero,xmm3[1],zero
396 ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
397 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm10, %ymm3
398 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2],ymm1[3],ymm3[4],ymm1[5],ymm3[6],ymm1[7]
399 ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
400 ; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[0,0,1,1]
401 ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
402 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm5, %ymm3
403 ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7]
404 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero
405 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
406 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm5, %ymm0
407 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4],ymm3[5],ymm0[6],ymm3[7]
408 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
409 ; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[0,0,1,1]
410 ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
411 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm5, %ymm3
412 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
413 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero
414 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
415 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2
416 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7]
417 ; AVX2-NEXT: vmovdqa %ymm2, (%r8)
418 ; AVX2-NEXT: vmovdqa %ymm0, 96(%r8)
419 ; AVX2-NEXT: vmovdqa %ymm1, 64(%r8)
420 ; AVX2-NEXT: vmovdqa %ymm11, 32(%r8)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: vf16:
; AVX512: # %bb.0:
426 ; AVX512-NEXT: vmovdqa (%rdi), %ymm0
427 ; AVX512-NEXT: vmovdqa (%rdx), %ymm1
428 ; AVX512-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0
429 ; AVX512-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1
430 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,16,32,48,1,17,33,49,2,18,34,50,3,19,35,51,4,20,36,52,5,21,37,53,6,22,38,54,7,23,39,55]
431 ; AVX512-NEXT: vpermi2w %zmm1, %zmm0, %zmm2
432 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [8,24,40,56,9,25,41,57,10,26,42,58,11,27,43,59,12,28,44,60,13,29,45,61,14,30,46,62,15,31,47,63]
433 ; AVX512-NEXT: vpermi2w %zmm1, %zmm0, %zmm3
434 ; AVX512-NEXT: vmovdqu64 %zmm3, 64(%r8)
435 ; AVX512-NEXT: vmovdqu64 %zmm2, (%r8)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
438 %in.vec0 = load <16 x i16>, ptr %in.vecptr0, align 32
439 %in.vec1 = load <16 x i16>, ptr %in.vecptr1, align 32
440 %in.vec2 = load <16 x i16>, ptr %in.vecptr2, align 32
441 %in.vec3 = load <16 x i16>, ptr %in.vecptr3, align 32
443 %concat01 = shufflevector <16 x i16> %in.vec0, <16 x i16> %in.vec1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
444 %concat23 = shufflevector <16 x i16> %in.vec2, <16 x i16> %in.vec3, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
445 %concat0123 = shufflevector <32 x i16> %concat01, <32 x i16> %concat23, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
446 %interleaved.vec = shufflevector <64 x i16> %concat0123, <64 x i16> poison, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63>
  store <64 x i16> %interleaved.vec, ptr %out.vec, align 32
  ret void
}
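
; vf32: interleaves four <32 x i16> inputs from %in.vecptr0..3 into a single
; <128 x i16> stride-4 store to %out.vec.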
define void @vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %out.vec) nounwind {
; SSE-LABEL: vf32:
; SSE: # %bb.0:
456 ; SSE-NEXT: movdqa (%rdi), %xmm10
457 ; SSE-NEXT: movdqa 16(%rdi), %xmm13
458 ; SSE-NEXT: movdqa 32(%rdi), %xmm8
459 ; SSE-NEXT: movdqa 48(%rdi), %xmm4
460 ; SSE-NEXT: movdqa (%rsi), %xmm3
461 ; SSE-NEXT: movdqa 16(%rsi), %xmm1
462 ; SSE-NEXT: movdqa 32(%rsi), %xmm9
463 ; SSE-NEXT: movdqa (%rdx), %xmm0
464 ; SSE-NEXT: movdqa 16(%rdx), %xmm5
465 ; SSE-NEXT: movdqa 32(%rdx), %xmm6
466 ; SSE-NEXT: movdqa (%rcx), %xmm11
467 ; SSE-NEXT: movdqa 16(%rcx), %xmm14
468 ; SSE-NEXT: movdqa 32(%rcx), %xmm12
469 ; SSE-NEXT: movdqa %xmm0, %xmm7
470 ; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3]
471 ; SSE-NEXT: movdqa %xmm10, %xmm15
472 ; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm3[0],xmm15[1],xmm3[1],xmm15[2],xmm3[2],xmm15[3],xmm3[3]
473 ; SSE-NEXT: movdqa %xmm15, %xmm2
474 ; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3]
475 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
476 ; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm7[0],xmm15[1],xmm7[1]
477 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7]
478 ; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm3[4],xmm10[5],xmm3[5],xmm10[6],xmm3[6],xmm10[7],xmm3[7]
479 ; SSE-NEXT: movdqa %xmm10, %xmm2
480 ; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
481 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
482 ; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1]
483 ; SSE-NEXT: movdqa %xmm5, %xmm0
484 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
485 ; SSE-NEXT: movdqa %xmm13, %xmm7
486 ; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3]
487 ; SSE-NEXT: movdqa %xmm7, %xmm2
488 ; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
489 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
490 ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1]
491 ; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm14[4],xmm5[5],xmm14[5],xmm5[6],xmm14[6],xmm5[7],xmm14[7]
492 ; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm1[4],xmm13[5],xmm1[5],xmm13[6],xmm1[6],xmm13[7],xmm1[7]
493 ; SSE-NEXT: movdqa %xmm13, %xmm11
494 ; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm5[2],xmm11[3],xmm5[3]
495 ; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm5[0],xmm13[1],xmm5[1]
496 ; SSE-NEXT: movdqa %xmm6, %xmm0
497 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3]
498 ; SSE-NEXT: movdqa %xmm8, %xmm5
499 ; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3]
500 ; SSE-NEXT: movdqa %xmm5, %xmm14
501 ; SSE-NEXT: punpckhdq {{.*#+}} xmm14 = xmm14[2],xmm0[2],xmm14[3],xmm0[3]
502 ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
503 ; SSE-NEXT: movdqa 48(%rdx), %xmm0
504 ; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7]
505 ; SSE-NEXT: movdqa 48(%rcx), %xmm12
506 ; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7]
507 ; SSE-NEXT: movdqa %xmm8, %xmm9
508 ; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm6[2],xmm9[3],xmm6[3]
509 ; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1]
510 ; SSE-NEXT: movdqa %xmm0, %xmm6
511 ; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3]
512 ; SSE-NEXT: movdqa 48(%rsi), %xmm2
513 ; SSE-NEXT: movdqa %xmm4, %xmm3
514 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
515 ; SSE-NEXT: movdqa %xmm3, %xmm1
516 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3]
517 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1]
518 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7]
519 ; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
520 ; SSE-NEXT: movdqa %xmm4, %xmm2
521 ; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
522 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
523 ; SSE-NEXT: movdqa %xmm4, 224(%r8)
524 ; SSE-NEXT: movdqa %xmm2, 240(%r8)
525 ; SSE-NEXT: movdqa %xmm3, 192(%r8)
526 ; SSE-NEXT: movdqa %xmm1, 208(%r8)
527 ; SSE-NEXT: movdqa %xmm8, 160(%r8)
528 ; SSE-NEXT: movdqa %xmm9, 176(%r8)
529 ; SSE-NEXT: movdqa %xmm5, 128(%r8)
530 ; SSE-NEXT: movdqa %xmm14, 144(%r8)
531 ; SSE-NEXT: movdqa %xmm13, 96(%r8)
532 ; SSE-NEXT: movdqa %xmm11, 112(%r8)
533 ; SSE-NEXT: movdqa %xmm7, 64(%r8)
534 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
535 ; SSE-NEXT: movaps %xmm0, 80(%r8)
536 ; SSE-NEXT: movdqa %xmm10, 32(%r8)
537 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
538 ; SSE-NEXT: movaps %xmm0, 48(%r8)
539 ; SSE-NEXT: movdqa %xmm15, (%r8)
540 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 16(%r8)
; SSE-NEXT: retq
;
; AVX1-LABEL: vf32:
; AVX1: # %bb.0:
546 ; AVX1-NEXT: vmovdqa (%rcx), %xmm12
547 ; AVX1-NEXT: vmovdqa 16(%rcx), %xmm15
548 ; AVX1-NEXT: vmovdqa 32(%rcx), %xmm3
549 ; AVX1-NEXT: vmovdqa 48(%rcx), %xmm11
550 ; AVX1-NEXT: vmovdqa (%rdx), %xmm13
551 ; AVX1-NEXT: vmovdqa 16(%rdx), %xmm6
552 ; AVX1-NEXT: vmovdqa 32(%rdx), %xmm7
553 ; AVX1-NEXT: vmovdqa 48(%rdx), %xmm1
554 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7]
555 ; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,0,1,1]
556 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
557 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm8
558 ; AVX1-NEXT: vmovdqa (%rsi), %xmm14
559 ; AVX1-NEXT: vmovdqa 48(%rsi), %xmm2
560 ; AVX1-NEXT: vmovdqa (%rdi), %xmm5
561 ; AVX1-NEXT: vmovdqa 48(%rdi), %xmm4
562 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm14[4],xmm5[5],xmm14[5],xmm5[6],xmm14[6],xmm5[7],xmm14[7]
563 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm0[0],zero,xmm0[1],zero
564 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
565 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm9, %ymm0
566 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2],ymm8[3],ymm0[4],ymm8[5],ymm0[6],ymm8[7]
567 ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
568 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3]
569 ; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,0,1,1]
570 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
571 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm8, %ymm8
572 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
573 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm0[0],zero,xmm0[1],zero
574 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
575 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm10, %ymm0
576 ; AVX1-NEXT: vmovdqa 32(%rsi), %xmm10
577 ; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0],ymm8[1],ymm0[2],ymm8[3],ymm0[4],ymm8[5],ymm0[6],ymm8[7]
578 ; AVX1-NEXT: vmovdqa 32(%rdi), %xmm0
579 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7]
580 ; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[0,0,1,1]
581 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
582 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm8, %ymm1
583 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
584 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero
585 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
586 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2
587 ; AVX1-NEXT: vblendps {{.*#+}} ymm8 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7]
588 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3]
589 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,0,1,1]
590 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
591 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
592 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3]
593 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero
594 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
595 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2
596 ; AVX1-NEXT: vmovdqa 16(%rsi), %xmm4
597 ; AVX1-NEXT: vblendps {{.*#+}} ymm11 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7]
598 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
599 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
600 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,0,1,1]
601 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
602 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
603 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
604 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
605 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
606 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
607 ; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
608 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm15[0],xmm6[1],xmm15[1],xmm6[2],xmm15[2],xmm6[3],xmm15[3]
609 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1]
610 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
611 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
612 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
613 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm2[0],zero,xmm2[1],zero
614 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
615 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm7, %ymm2
616 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4],ymm0[5],ymm2[6],ymm0[7]
617 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm6[4],xmm15[4],xmm6[5],xmm15[5],xmm6[6],xmm15[6],xmm6[7],xmm15[7]
618 ; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[0,0,1,1]
619 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
620 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm6, %ymm2
621 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
622 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero
623 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
624 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1
625 ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
626 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
627 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,0,1,1]
628 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
629 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2
630 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm14[0],xmm5[1],xmm14[1],xmm5[2],xmm14[2],xmm5[3],xmm14[3]
631 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
632 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3]
633 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4
634 ; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4],ymm2[5],ymm4[6],ymm2[7]
635 ; AVX1-NEXT: vmovaps %ymm2, (%r8)
636 ; AVX1-NEXT: vmovaps %ymm1, 96(%r8)
637 ; AVX1-NEXT: vmovaps %ymm0, 64(%r8)
638 ; AVX1-NEXT: vmovaps %ymm3, 160(%r8)
639 ; AVX1-NEXT: vmovaps %ymm11, 128(%r8)
640 ; AVX1-NEXT: vmovaps %ymm8, 224(%r8)
641 ; AVX1-NEXT: vmovaps %ymm9, 192(%r8)
642 ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
643 ; AVX1-NEXT: vmovaps %ymm0, 32(%r8)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: vf32:
; AVX2: # %bb.0:
649 ; AVX2-NEXT: vmovdqa (%rcx), %xmm12
650 ; AVX2-NEXT: vmovdqa 16(%rcx), %xmm15
651 ; AVX2-NEXT: vmovdqa 32(%rcx), %xmm3
652 ; AVX2-NEXT: vmovdqa 48(%rcx), %xmm11
653 ; AVX2-NEXT: vmovdqa (%rdx), %xmm13
654 ; AVX2-NEXT: vmovdqa 16(%rdx), %xmm6
655 ; AVX2-NEXT: vmovdqa 32(%rdx), %xmm7
656 ; AVX2-NEXT: vmovdqa 48(%rdx), %xmm1
657 ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7]
658 ; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,0,1,1]
659 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
660 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm8
661 ; AVX2-NEXT: vmovdqa (%rsi), %xmm14
662 ; AVX2-NEXT: vmovdqa 48(%rsi), %xmm2
663 ; AVX2-NEXT: vmovdqa (%rdi), %xmm5
664 ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm4
665 ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm14[4],xmm5[5],xmm14[5],xmm5[6],xmm14[6],xmm5[7],xmm14[7]
666 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm0[0],zero,xmm0[1],zero
667 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
668 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm9, %ymm0
669 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2],ymm8[3],ymm0[4],ymm8[5],ymm0[6],ymm8[7]
670 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
671 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3]
672 ; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,0,1,1]
673 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
674 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm8, %ymm8
675 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
676 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm0[0],zero,xmm0[1],zero
677 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
678 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm10, %ymm0
679 ; AVX2-NEXT: vmovdqa 32(%rsi), %xmm10
680 ; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0],ymm8[1],ymm0[2],ymm8[3],ymm0[4],ymm8[5],ymm0[6],ymm8[7]
681 ; AVX2-NEXT: vmovdqa 32(%rdi), %xmm0
682 ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7]
683 ; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[0,0,1,1]
684 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
685 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm8, %ymm1
686 ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
687 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero
688 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
689 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2
690 ; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7]
691 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3]
692 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,0,1,1]
693 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
694 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
695 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3]
696 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero
697 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
698 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2
699 ; AVX2-NEXT: vmovdqa 16(%rsi), %xmm4
700 ; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7]
701 ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
702 ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
703 ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,0,1,1]
704 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
705 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
706 ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
707 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
708 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
709 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0
710 ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
711 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm15[0],xmm6[1],xmm15[1],xmm6[2],xmm15[2],xmm6[3],xmm15[3]
712 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1]
713 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
714 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0
715 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
716 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm2[0],zero,xmm2[1],zero
717 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
718 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm7, %ymm2
719 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4],ymm0[5],ymm2[6],ymm0[7]
720 ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm6[4],xmm15[4],xmm6[5],xmm15[5],xmm6[6],xmm15[6],xmm6[7],xmm15[7]
721 ; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[0,0,1,1]
722 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
723 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm6, %ymm2
724 ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
725 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero
726 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
727 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm4, %ymm1
728 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
729 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
730 ; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,0,1,1]
731 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
732 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2
733 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm14[0],xmm5[1],xmm14[1],xmm5[2],xmm14[2],xmm5[3],xmm14[3]
734 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
735 ; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3]
736 ; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4
737 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4],ymm2[5],ymm4[6],ymm2[7]
738 ; AVX2-NEXT: vmovdqa %ymm2, (%r8)
739 ; AVX2-NEXT: vmovdqa %ymm1, 96(%r8)
740 ; AVX2-NEXT: vmovdqa %ymm0, 64(%r8)
741 ; AVX2-NEXT: vmovdqa %ymm3, 160(%r8)
742 ; AVX2-NEXT: vmovdqa %ymm11, 128(%r8)
743 ; AVX2-NEXT: vmovdqa %ymm8, 224(%r8)
744 ; AVX2-NEXT: vmovdqa %ymm9, 192(%r8)
745 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
746 ; AVX2-NEXT: vmovaps %ymm0, 32(%r8)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: vf32:
; AVX512: # %bb.0:
752 ; AVX512-NEXT: vmovdqu64 (%rdi), %zmm0
753 ; AVX512-NEXT: vmovdqu64 (%rsi), %zmm1
754 ; AVX512-NEXT: vmovdqu64 (%rdx), %zmm2
755 ; AVX512-NEXT: vmovdqu64 (%rcx), %zmm3
756 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = <u,u,0,32,u,u,1,33,u,u,2,34,u,u,3,35,u,u,4,36,u,u,5,37,u,u,6,38,u,u,7,39>
757 ; AVX512-NEXT: vpermi2w %zmm3, %zmm2, %zmm4
758 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,32,u,u,1,33,u,u,2,34,u,u,3,35,u,u,4,36,u,u,5,37,u,u,6,38,u,u,7,39,u,u>
759 ; AVX512-NEXT: vpermi2w %zmm1, %zmm0, %zmm5
760 ; AVX512-NEXT: movw $-21846, %ax # imm = 0xAAAA
761 ; AVX512-NEXT: kmovd %eax, %k1
762 ; AVX512-NEXT: vmovdqa32 %zmm4, %zmm5 {%k1}
763 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = <u,u,8,40,u,u,9,41,u,u,10,42,u,u,11,43,u,u,12,44,u,u,13,45,u,u,14,46,u,u,15,47>
764 ; AVX512-NEXT: vpermi2w %zmm3, %zmm2, %zmm4
765 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = <8,40,u,u,9,41,u,u,10,42,u,u,11,43,u,u,12,44,u,u,13,45,u,u,14,46,u,u,15,47,u,u>
766 ; AVX512-NEXT: vpermi2w %zmm1, %zmm0, %zmm6
767 ; AVX512-NEXT: vmovdqa32 %zmm4, %zmm6 {%k1}
768 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = <u,u,16,48,u,u,17,49,u,u,18,50,u,u,19,51,u,u,20,52,u,u,21,53,u,u,22,54,u,u,23,55>
769 ; AVX512-NEXT: vpermi2w %zmm3, %zmm2, %zmm4
770 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = <16,48,u,u,17,49,u,u,18,50,u,u,19,51,u,u,20,52,u,u,21,53,u,u,22,54,u,u,23,55,u,u>
771 ; AVX512-NEXT: vpermi2w %zmm1, %zmm0, %zmm7
772 ; AVX512-NEXT: vmovdqa32 %zmm4, %zmm7 {%k1}
773 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = <u,u,24,56,u,u,25,57,u,u,26,58,u,u,27,59,u,u,28,60,u,u,29,61,u,u,30,62,u,u,31,63>
774 ; AVX512-NEXT: vpermi2w %zmm3, %zmm2, %zmm4
775 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = <24,56,u,u,25,57,u,u,26,58,u,u,27,59,u,u,28,60,u,u,29,61,u,u,30,62,u,u,31,63,u,u>
776 ; AVX512-NEXT: vpermi2w %zmm1, %zmm0, %zmm2
777 ; AVX512-NEXT: vmovdqa32 %zmm4, %zmm2 {%k1}
778 ; AVX512-NEXT: vmovdqu64 %zmm2, 192(%r8)
779 ; AVX512-NEXT: vmovdqu64 %zmm7, 128(%r8)
780 ; AVX512-NEXT: vmovdqu64 %zmm6, 64(%r8)
781 ; AVX512-NEXT: vmovdqu64 %zmm5, (%r8)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
784 %in.vec0 = load <32 x i16>, ptr %in.vecptr0, align 32
785 %in.vec1 = load <32 x i16>, ptr %in.vecptr1, align 32
786 %in.vec2 = load <32 x i16>, ptr %in.vecptr2, align 32
787 %in.vec3 = load <32 x i16>, ptr %in.vecptr3, align 32
789 %concat01 = shufflevector <32 x i16> %in.vec0, <32 x i16> %in.vec1, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
790 %concat23 = shufflevector <32 x i16> %in.vec2, <32 x i16> %in.vec3, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
791 %concat0123 = shufflevector <64 x i16> %concat01, <64 x i16> %concat23, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
792 %interleaved.vec = shufflevector <128 x i16> %concat0123, <128 x i16> poison, <128 x i32> <i32 0, i32 32, i32 64, i32 96, i32 1, i32 33, i32 65, i32 97, i32 2, i32 34, i32 66, i32 98, i32 3, i32 35, i32 67, i32 99, i32 4, i32 36, i32 68, i32 100, i32 5, i32 37, i32 69, i32 101, i32 6, i32 38, i32 70, i32 102, i32 7, i32 39, i32 71, i32 103, i32 8, i32 40, i32 72, i32 104, i32 9, i32 41, i32 73, i32 105, i32 10, i32 42, i32 74, i32 106, i32 11, i32 43, i32 75, i32 107, i32 12, i32 44, i32 76, i32 108, i32 13, i32 45, i32 77, i32 109, i32 14, i32 46, i32 78, i32 110, i32 15, i32 47, i32 79, i32 111, i32 16, i32 48, i32 80, i32 112, i32 17, i32 49, i32 81, i32 113, i32 18, i32 50, i32 82, i32 114, i32 19, i32 51, i32 83, i32 115, i32 20, i32 52, i32 84, i32 116, i32 21, i32 53, i32 85, i32 117, i32 22, i32 54, i32 86, i32 118, i32 23, i32 55, i32 87, i32 119, i32 24, i32 56, i32 88, i32 120, i32 25, i32 57, i32 89, i32 121, i32 26, i32 58, i32 90, i32 122, i32 27, i32 59, i32 91, i32 123, i32 28, i32 60, i32 92, i32 124, i32 29, i32 61, i32 93, i32 125, i32 30, i32 62, i32 94, i32 126, i32 31, i32 63, i32 95, i32 127>
  store <128 x i16> %interleaved.vec, ptr %out.vec, align 32
  ret void
}