1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
3 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX1
4 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2-SLOW
5 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FAST
6 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FAST
7 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512
9 ; These patterns are produced by the LoopVectorizer for stride-5 interleaved stores of i16 vectors.
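;
; A minimal illustrative sketch (not part of the original test): a scalar C
; loop of the following shape is what the LoopVectorizer would turn into such
; a stride-5 interleaved store group. The function and array names are
; hypothetical.
;
;   void store_i16_stride5(short *restrict out, const short *a, const short *b,
;                          const short *c, const short *d, const short *e,
;                          int n) {
;     for (int i = 0; i < n; ++i) {
;       out[5 * i + 0] = a[i]; // in.vec0
;       out[5 * i + 1] = b[i]; // in.vec1
;       out[5 * i + 2] = c[i]; // in.vec2
;       out[5 * i + 3] = d[i]; // in.vec3
;       out[5 * i + 4] = e[i]; // in.vec4
;     }
;   }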
11 define void @vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %out.vec) nounwind {
12 ; SSE-LABEL: vf2:
13 ; SSE: # %bb.0:
14 ; SSE-NEXT: movdqa (%rdi), %xmm0
15 ; SSE-NEXT: movdqa (%rdx), %xmm1
16 ; SSE-NEXT: movdqa (%r8), %xmm2
17 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
18 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
19 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
20 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,2,1,3,4,5,6,7]
21 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,4]
22 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
23 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7]
24 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,7,5]
25 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,0,65535,65535,65535]
26 ; SSE-NEXT: pand %xmm3, %xmm1
27 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,1,0,1]
28 ; SSE-NEXT: pandn %xmm4, %xmm3
29 ; SSE-NEXT: por %xmm1, %xmm3
30 ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
31 ; SSE-NEXT: psrld $16, %xmm2
32 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
33 ; SSE-NEXT: movd %xmm0, 16(%r9)
34 ; SSE-NEXT: movdqa %xmm3, (%r9)
35 ; SSE-NEXT: retq
36 ;
37 ; AVX1-LABEL: vf2:
38 ; AVX1: # %bb.0:
39 ; AVX1-NEXT: vmovdqa (%rdi), %xmm0
40 ; AVX1-NEXT: vmovdqa (%rdx), %xmm1
41 ; AVX1-NEXT: vmovdqa (%r8), %xmm2
42 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
43 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
44 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
45 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,u,u,2,3,6,7,10,11]
46 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,0,1]
47 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5,6,7]
48 ; AVX1-NEXT: vpsrlq $48, %xmm1, %xmm1
49 ; AVX1-NEXT: vpsrld $16, %xmm2, %xmm2
50 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
51 ; AVX1-NEXT: vmovd %xmm1, 16(%r9)
52 ; AVX1-NEXT: vmovdqa %xmm0, (%r9)
53 ; AVX1-NEXT: retq
54 ;
55 ; AVX2-SLOW-LABEL: vf2:
56 ; AVX2-SLOW: # %bb.0:
57 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0
58 ; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm1
59 ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
60 ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
61 ; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
62 ; AVX2-SLOW-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
63 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13,u,u,2,3,6,7,10,11,u,u,18,19,u,u,u,u,u,u,u,u,u,u,u,u]
64 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
65 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,3,0,3,4,7,4,7]
66 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15]
67 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,0,0,255,255,u,u,u,u,u,u,u,u,u,u,u,u>
68 ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
69 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
70 ; AVX2-SLOW-NEXT: vmovd %xmm1, 16(%r9)
71 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, (%r9)
72 ; AVX2-SLOW-NEXT: vzeroupper
73 ; AVX2-SLOW-NEXT: retq
75 ; AVX2-FAST-LABEL: vf2:
76 ; AVX2-FAST: # %bb.0:
77 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0
78 ; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm1
79 ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
80 ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
81 ; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
82 ; AVX2-FAST-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
83 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13],zero,zero,ymm0[2,3,6,7,10,11],zero,zero,ymm0[18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
84 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
85 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,zero,zero,ymm0[30,31],zero,zero,ymm0[30,31,30,31,16,17,18,19,28,29,30,31]
86 ; AVX2-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0
87 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1
88 ; AVX2-FAST-NEXT: vmovd %xmm1, 16(%r9)
89 ; AVX2-FAST-NEXT: vmovdqa %xmm0, (%r9)
90 ; AVX2-FAST-NEXT: vzeroupper
91 ; AVX2-FAST-NEXT: retq
92 ;
93 ; AVX512-LABEL: vf2:
94 ; AVX512: # %bb.0:
95 ; AVX512-NEXT: vmovdqa (%rdi), %xmm0
96 ; AVX512-NEXT: vmovdqa (%rdx), %xmm1
97 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
98 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
99 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
100 ; AVX512-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
101 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,8,1,3,5,7,9,u,u,u,u,u,u>
102 ; AVX512-NEXT: vpermw %ymm0, %ymm1, %ymm0
103 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
104 ; AVX512-NEXT: vmovd %xmm1, 16(%r9)
105 ; AVX512-NEXT: vmovdqa %xmm0, (%r9)
106 ; AVX512-NEXT: vzeroupper
107 ; AVX512-NEXT: retq
108 %in.vec0 = load <2 x i16>, ptr %in.vecptr0, align 32
109 %in.vec1 = load <2 x i16>, ptr %in.vecptr1, align 32
110 %in.vec2 = load <2 x i16>, ptr %in.vecptr2, align 32
111 %in.vec3 = load <2 x i16>, ptr %in.vecptr3, align 32
112 %in.vec4 = load <2 x i16>, ptr %in.vecptr4, align 32
114 %concat01 = shufflevector <2 x i16> %in.vec0, <2 x i16> %in.vec1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
115 %concat23 = shufflevector <2 x i16> %in.vec2, <2 x i16> %in.vec3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
116 %concat0123 = shufflevector <4 x i16> %concat01, <4 x i16> %concat23, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
117 %concat4uuu = shufflevector <2 x i16> %in.vec4, <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
118 %concat01234 = shufflevector <8 x i16> %concat0123, <8 x i16> %concat4uuu, <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
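; The shuffle below performs the stride-5 interleave itself: element j of
; input vector k ends up at out[5*j + k].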
119 %interleaved.vec = shufflevector <10 x i16> %concat01234, <10 x i16> poison, <10 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 1, i32 3, i32 5, i32 7, i32 9>
121 store <10 x i16> %interleaved.vec, ptr %out.vec, align 32
122 ret void
123 }
126 define void @vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %out.vec) nounwind {
127 ; SSE-LABEL: vf4:
128 ; SSE: # %bb.0:
129 ; SSE-NEXT: movq {{.*#+}} xmm2 = mem[0],zero
130 ; SSE-NEXT: movq {{.*#+}} xmm5 = mem[0],zero
131 ; SSE-NEXT: movq {{.*#+}} xmm3 = mem[0],zero
132 ; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
133 ; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
134 ; SSE-NEXT: movdqa %xmm1, %xmm4
135 ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
136 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
137 ; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
138 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[3,3,3,3]
139 ; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
140 ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,0,65535,65535,65535]
141 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,2,3]
142 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,1,1,3,4,5,6,7]
143 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,6,7]
144 ; SSE-NEXT: pand %xmm7, %xmm4
145 ; SSE-NEXT: pandn %xmm5, %xmm7
146 ; SSE-NEXT: por %xmm4, %xmm7
147 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,0,65535,65535,65535,65535,0,65535]
148 ; SSE-NEXT: pand %xmm4, %xmm7
149 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,1,0,1]
150 ; SSE-NEXT: pandn %xmm5, %xmm4
151 ; SSE-NEXT: por %xmm7, %xmm4
152 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
153 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,2,2,4,5,6,7]
154 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,1]
155 ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,0,0,65535,65535,65535,0]
156 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,2,4,5,6,7]
157 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,1]
158 ; SSE-NEXT: pand %xmm7, %xmm2
159 ; SSE-NEXT: pandn %xmm3, %xmm7
160 ; SSE-NEXT: por %xmm2, %xmm7
161 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,0,65535,65535,65535]
162 ; SSE-NEXT: pand %xmm2, %xmm7
163 ; SSE-NEXT: pandn %xmm5, %xmm2
164 ; SSE-NEXT: por %xmm7, %xmm2
165 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,0,65535,65535,65535,65535]
166 ; SSE-NEXT: psrlq $48, %xmm1
167 ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1]
168 ; SSE-NEXT: pand %xmm3, %xmm6
169 ; SSE-NEXT: pandn %xmm0, %xmm3
170 ; SSE-NEXT: por %xmm6, %xmm3
171 ; SSE-NEXT: movq %xmm3, 32(%r9)
172 ; SSE-NEXT: movdqa %xmm2, (%r9)
173 ; SSE-NEXT: movdqa %xmm4, 16(%r9)
174 ; SSE-NEXT: retq
175 ;
176 ; AVX1-LABEL: vf4:
177 ; AVX1: # %bb.0:
178 ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
179 ; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
180 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
181 ; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
182 ; AVX1-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
183 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
184 ; AVX1-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
185 ; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
186 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[3,1,2,3]
187 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,1,3,4,5,6,7]
188 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3,4,5,6,7]
189 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[3],xmm3[4,5,6,7]
190 ; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[10,11,10,11,u,u,u,u,4,5,12,13,u,u,u,u]
191 ; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[3,1,2,1]
192 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,2,0,4,5,6,7]
193 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5],xmm5[6,7]
194 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
195 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm2[1],xmm4[2,3,4,5],xmm2[6],xmm4[7]
196 ; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,0,1,8,9,u,u,u,u,u,u,2,3]
197 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,u,u,u,u,u,u,2,3,10,11,u,u]
198 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6],xmm1[7]
199 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5,6,7]
200 ; AVX1-NEXT: vmovdqa %xmm0, (%r9)
201 ; AVX1-NEXT: vmovdqa %xmm4, 16(%r9)
202 ; AVX1-NEXT: vmovq %xmm3, 32(%r9)
203 ; AVX1-NEXT: retq
204 ;
205 ; AVX2-SLOW-LABEL: vf4:
206 ; AVX2-SLOW: # %bb.0:
207 ; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
208 ; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
209 ; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
210 ; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
211 ; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
212 ; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
213 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2
214 ; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
215 ; AVX2-SLOW-NEXT: vpbroadcastq %xmm3, %ymm3
216 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[0,1,8,9,u,u,u,u,u,u,2,3,10,11,u,u,26,27,u,u,u,u,u,u,20,21,28,29,u,u,u,u]
217 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1]
218 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,0,1,8,9,u,u,u,u,u,u,2,3,u,u,u,u,20,21,28,29,u,u,u,u,u,u,22,23]
219 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6],ymm2[7],ymm4[8,9],ymm2[10,11],ymm4[12,13,14],ymm2[15]
220 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255]
221 ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2
222 ; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
223 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
224 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,1,3,4,5,6,7]
225 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
226 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3],xmm0[4,5,6,7]
227 ; AVX2-SLOW-NEXT: vmovq %xmm0, 32(%r9)
228 ; AVX2-SLOW-NEXT: vmovdqa %ymm2, (%r9)
229 ; AVX2-SLOW-NEXT: vzeroupper
230 ; AVX2-SLOW-NEXT: retq
232 ; AVX2-FAST-LABEL: vf4:
233 ; AVX2-FAST: # %bb.0:
234 ; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
235 ; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
236 ; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
237 ; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
238 ; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
239 ; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
240 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2
241 ; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
242 ; AVX2-FAST-NEXT: vpbroadcastq %xmm3, %ymm3
243 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[0,1,8,9,u,u,u,u,u,u,2,3,10,11,u,u,26,27,u,u,u,u,u,u,20,21,28,29,u,u,u,u]
244 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1]
245 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,0,1,8,9,u,u,u,u,u,u,2,3,u,u,u,u,20,21,28,29,u,u,u,u,u,u,22,23]
246 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6],ymm2[7],ymm4[8,9],ymm2[10,11],ymm4[12,13,14],ymm2[15]
247 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255]
248 ; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2
249 ; AVX2-FAST-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
250 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,6,7,14,15,u,u,u,u,u,u,u,u,u,u]
251 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
252 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3],xmm0[4,5,6,7]
253 ; AVX2-FAST-NEXT: vmovq %xmm0, 32(%r9)
254 ; AVX2-FAST-NEXT: vmovdqa %ymm2, (%r9)
255 ; AVX2-FAST-NEXT: vzeroupper
256 ; AVX2-FAST-NEXT: retq
257 ;
258 ; AVX512-LABEL: vf4:
259 ; AVX512: # %bb.0:
260 ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
261 ; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
262 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
263 ; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
264 ; AVX512-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
265 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
266 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
267 ; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
268 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
269 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,4,8,12,16,1,5,9,13,17,2,6,10,14,18,3,7,11,15,19,u,u,u,u,u,u,u,u,u,u,u,u>
270 ; AVX512-NEXT: vpermw %zmm0, %zmm1, %zmm0
271 ; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm1
272 ; AVX512-NEXT: vmovq %xmm1, 32(%r9)
273 ; AVX512-NEXT: vmovdqa %ymm0, (%r9)
274 ; AVX512-NEXT: vzeroupper
275 ; AVX512-NEXT: retq
276 %in.vec0 = load <4 x i16>, ptr %in.vecptr0, align 32
277 %in.vec1 = load <4 x i16>, ptr %in.vecptr1, align 32
278 %in.vec2 = load <4 x i16>, ptr %in.vecptr2, align 32
279 %in.vec3 = load <4 x i16>, ptr %in.vecptr3, align 32
280 %in.vec4 = load <4 x i16>, ptr %in.vecptr4, align 32
282 %concat01 = shufflevector <4 x i16> %in.vec0, <4 x i16> %in.vec1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
283 %concat23 = shufflevector <4 x i16> %in.vec2, <4 x i16> %in.vec3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
284 %concat0123 = shufflevector <8 x i16> %concat01, <8 x i16> %concat23, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
285 %concat4uuu = shufflevector <4 x i16> %in.vec4, <4 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
286 %concat01234 = shufflevector <16 x i16> %concat0123, <16 x i16> %concat4uuu, <20 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
287 %interleaved.vec = shufflevector <20 x i16> %concat01234, <20 x i16> poison, <20 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 1, i32 5, i32 9, i32 13, i32 17, i32 2, i32 6, i32 10, i32 14, i32 18, i32 3, i32 7, i32 11, i32 15, i32 19>
289 store <20 x i16> %interleaved.vec, ptr %out.vec, align 32
290 ret void
291 }
294 define void @vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %out.vec) nounwind {
295 ; SSE-LABEL: vf8:
296 ; SSE: # %bb.0:
297 ; SSE-NEXT: movdqa (%rdi), %xmm13
298 ; SSE-NEXT: movdqa (%rsi), %xmm11
299 ; SSE-NEXT: movdqa (%rdx), %xmm10
300 ; SSE-NEXT: movdqa (%rcx), %xmm12
301 ; SSE-NEXT: movdqa (%r8), %xmm8
302 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,0,65535,65535,65535]
303 ; SSE-NEXT: movdqa %xmm0, %xmm1
304 ; SSE-NEXT: pandn %xmm13, %xmm1
305 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm11[3,3,3,3,4,5,6,7]
306 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4]
307 ; SSE-NEXT: pand %xmm0, %xmm4
308 ; SSE-NEXT: por %xmm1, %xmm4
309 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,0,0,65535,65535]
310 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm10[1,1,2,2]
311 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,0,65535,65535,65535,65535,0]
312 ; SSE-NEXT: pand %xmm3, %xmm5
313 ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm12[3,3,3,3,4,5,6,7]
314 ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4]
315 ; SSE-NEXT: movdqa %xmm3, %xmm1
316 ; SSE-NEXT: pandn %xmm7, %xmm1
317 ; SSE-NEXT: por %xmm5, %xmm1
318 ; SSE-NEXT: pand %xmm2, %xmm1
319 ; SSE-NEXT: pandn %xmm4, %xmm2
320 ; SSE-NEXT: por %xmm1, %xmm2
321 ; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,65535,65535,65535,65535]
322 ; SSE-NEXT: pand %xmm5, %xmm2
323 ; SSE-NEXT: pandn %xmm8, %xmm5
324 ; SSE-NEXT: por %xmm2, %xmm5
325 ; SSE-NEXT: movdqa %xmm10, %xmm1
326 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3]
327 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7]
328 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1]
329 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,0,0,65535,65535,65535,0]
330 ; SSE-NEXT: movdqa %xmm13, %xmm4
331 ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm11[0],xmm4[1],xmm11[1],xmm4[2],xmm11[2],xmm4[3],xmm11[3]
332 ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm4[0,1,3,2,4,5,6,7]
333 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,1,1]
334 ; SSE-NEXT: pand %xmm2, %xmm7
335 ; SSE-NEXT: pandn %xmm1, %xmm2
336 ; SSE-NEXT: por %xmm7, %xmm2
337 ; SSE-NEXT: pand %xmm0, %xmm2
338 ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,1,0,1]
339 ; SSE-NEXT: pandn %xmm9, %xmm0
340 ; SSE-NEXT: por %xmm2, %xmm0
341 ; SSE-NEXT: movdqa %xmm13, %xmm1
342 ; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7]
343 ; SSE-NEXT: psrlq $48, %xmm11
344 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm11[1]
345 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,0,65535,65535,65535]
346 ; SSE-NEXT: movdqa %xmm2, %xmm7
347 ; SSE-NEXT: pandn %xmm1, %xmm7
348 ; SSE-NEXT: movdqa %xmm10, %xmm1
349 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm12[4],xmm1[5],xmm12[5],xmm1[6],xmm12[6],xmm1[7],xmm12[7]
350 ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm1[0,1,2,3,4,5,7,6]
351 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,1,3,3]
352 ; SSE-NEXT: pand %xmm2, %xmm6
353 ; SSE-NEXT: por %xmm7, %xmm6
354 ; SSE-NEXT: pand %xmm3, %xmm6
355 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm8[2,3,2,3]
356 ; SSE-NEXT: pandn %xmm7, %xmm3
357 ; SSE-NEXT: por %xmm6, %xmm3
358 ; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
359 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm13[0,2,3,3,4,5,6,7]
360 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,2,2]
361 ; SSE-NEXT: pand %xmm2, %xmm6
362 ; SSE-NEXT: pandn %xmm1, %xmm2
363 ; SSE-NEXT: por %xmm6, %xmm2
364 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,0,65535,65535]
365 ; SSE-NEXT: pand %xmm1, %xmm2
366 ; SSE-NEXT: pandn %xmm7, %xmm1
367 ; SSE-NEXT: por %xmm2, %xmm1
368 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,4,5,6,6]
369 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
370 ; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3]
371 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm12[2,2,2,2,4,5,6,7]
372 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,4,6,7]
373 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
374 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
375 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,0,65535]
376 ; SSE-NEXT: pand %xmm2, %xmm4
377 ; SSE-NEXT: pandn %xmm9, %xmm2
378 ; SSE-NEXT: por %xmm4, %xmm2
379 ; SSE-NEXT: movdqa %xmm2, 16(%r9)
380 ; SSE-NEXT: movdqa %xmm1, 48(%r9)
381 ; SSE-NEXT: movdqa %xmm3, 64(%r9)
382 ; SSE-NEXT: movdqa %xmm0, (%r9)
383 ; SSE-NEXT: movdqa %xmm5, 32(%r9)
384 ; SSE-NEXT: retq
385 ;
386 ; AVX1-LABEL: vf8:
387 ; AVX1: # %bb.0:
388 ; AVX1-NEXT: vmovdqa (%rdi), %xmm0
389 ; AVX1-NEXT: vmovdqa (%rsi), %xmm3
390 ; AVX1-NEXT: vmovdqa (%rdx), %xmm4
391 ; AVX1-NEXT: vmovdqa (%rcx), %xmm5
392 ; AVX1-NEXT: vmovdqa (%r8), %xmm1
393 ; AVX1-NEXT: vpsrlq $48, %xmm3, %xmm2
394 ; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm0[1],xmm2[1]
395 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
396 ; AVX1-NEXT: vpshufhw {{.*#+}} xmm7 = xmm9[0,1,2,3,4,5,7,6]
397 ; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,3,3]
398 ; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm7[0,1,2],xmm2[3,4],xmm7[5,6,7]
399 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
400 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,2,2,4,5,6,7]
401 ; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,0,2,1]
402 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
403 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm6 = xmm2[0,1,3,2,4,5,6,7]
404 ; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,1,1]
405 ; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3],xmm6[4,5,6],xmm7[7]
406 ; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[0,1,0,1]
407 ; AVX1-NEXT: vpblendw {{.*#+}} xmm10 = xmm6[0,1,2,3],xmm7[4],xmm6[5,6,7]
408 ; AVX1-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,6]
409 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
410 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
411 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[2,2,2,2,4,5,6,7]
412 ; AVX1-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,4,6,7]
413 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1],xmm2[2,3],xmm6[4,5],xmm2[6,7]
414 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm7[1],xmm2[2,3,4,5],xmm7[6],xmm2[7]
415 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm6 = xmm3[3,3,3,3,4,5,6,7]
416 ; AVX1-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4]
417 ; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm0[4],xmm6[5,6,7]
418 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,2]
419 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,3,3,3,4,5,6,7]
420 ; AVX1-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4]
421 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3,4,5,6],xmm5[7]
422 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0],xmm4[1,2,3],xmm6[4,5],xmm4[6,7]
423 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm1[3],xmm4[4,5,6,7]
424 ; AVX1-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,xmm9[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
425 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
426 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7]
427 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,2]
428 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm5[3,4],xmm0[5,6,7]
429 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
430 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4],xmm1[5],xmm0[6,7]
431 ; AVX1-NEXT: vmovdqa %xmm0, 48(%r9)
432 ; AVX1-NEXT: vmovdqa %xmm4, 32(%r9)
433 ; AVX1-NEXT: vmovdqa %xmm2, 16(%r9)
434 ; AVX1-NEXT: vmovdqa %xmm10, (%r9)
435 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0,1],xmm1[2],xmm8[3,4,5,6],xmm1[7]
436 ; AVX1-NEXT: vmovdqa %xmm0, 64(%r9)
437 ; AVX1-NEXT: retq
438 ;
439 ; AVX2-SLOW-LABEL: vf8:
440 ; AVX2-SLOW: # %bb.0:
441 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0
442 ; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm2
443 ; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm1
444 ; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm3
445 ; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm4
446 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm5
447 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm6
448 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm5[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15]
449 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,2,3,2,4,6,7,6]
450 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm5[2,3,0,1]
451 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[6,7,10,11,u,u,6,7,u,u,8,9,12,13,u,u,22,23,26,27,u,u,22,23,u,u,24,25,28,29,u,u]
452 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1],ymm7[2],ymm8[3],ymm7[4],ymm8[5,6],ymm7[7],ymm8[8,9],ymm7[10],ymm8[11],ymm7[12],ymm8[13,14],ymm7[15]
453 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm6[2,3,0,1]
454 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,6,7,10,11,u,u,u,u,u,u,8,9,u,u,u,u,22,23,26,27,u,u,u,u,u,u,24,25]
455 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm6[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15]
456 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,5,5,4,7,8,9,10,11,13,13,12,15]
457 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3],ymm9[4,5,6],ymm8[7],ymm9[8,9],ymm8[10,11],ymm9[12,13,14],ymm8[15]
458 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255>
459 ; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7
460 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,1]
461 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255]
462 ; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm7, %ymm4, %ymm4
463 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,0]
464 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,ymm6[0,1,8,9,12,13],zero,zero,zero,zero,ymm6[2,3,18,19,18,19],zero,zero,zero,zero,ymm6[28,29,20,21,28,29],zero,zero
465 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,0,2]
466 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,1,8,9],zero,zero,zero,zero,zero,zero,ymm5[2,3,10,11],zero,zero,zero,zero,zero,zero,ymm5[20,21,28,29],zero,zero,zero,zero,zero,zero,ymm5[22,23]
467 ; AVX2-SLOW-NEXT: vpor %ymm6, %ymm5, %ymm5
468 ; AVX2-SLOW-NEXT: vpbroadcastq (%r8), %ymm6
469 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255]
470 ; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5
471 ; AVX2-SLOW-NEXT: vpsrlq $48, %xmm2, %xmm2
472 ; AVX2-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm2[1]
473 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
474 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,6]
475 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
476 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7]
477 ; AVX2-SLOW-NEXT: vpbroadcastd 12(%r8), %xmm1
478 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6],xmm1[7]
479 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, 64(%r9)
480 ; AVX2-SLOW-NEXT: vmovdqa %ymm5, (%r9)
481 ; AVX2-SLOW-NEXT: vmovdqa %ymm4, 32(%r9)
482 ; AVX2-SLOW-NEXT: vzeroupper
483 ; AVX2-SLOW-NEXT: retq
484 ;
485 ; AVX512-LABEL: vf8:
486 ; AVX512: # %bb.0:
487 ; AVX512-NEXT: vmovdqa (%rdi), %xmm0
488 ; AVX512-NEXT: vmovdqa (%rdx), %xmm1
489 ; AVX512-NEXT: vmovdqa (%r8), %xmm2
490 ; AVX512-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
491 ; AVX512-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
492 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
493 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [22,30,38,7,15,23,31,39]
494 ; AVX512-NEXT: vpermi2w %zmm2, %zmm0, %zmm1
495 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,16,24,32,1,9,17,25,33,2,10,18,26,34,3,11,19,27,35,4,12,20,28,36,5,13,21,29,37,6,14]
496 ; AVX512-NEXT: vpermi2w %zmm2, %zmm0, %zmm3
497 ; AVX512-NEXT: vmovdqu64 %zmm3, (%r9)
498 ; AVX512-NEXT: vmovdqa %xmm1, 64(%r9)
499 ; AVX512-NEXT: vzeroupper
500 ; AVX512-NEXT: retq
501 %in.vec0 = load <8 x i16>, ptr %in.vecptr0, align 32
502 %in.vec1 = load <8 x i16>, ptr %in.vecptr1, align 32
503 %in.vec2 = load <8 x i16>, ptr %in.vecptr2, align 32
504 %in.vec3 = load <8 x i16>, ptr %in.vecptr3, align 32
505 %in.vec4 = load <8 x i16>, ptr %in.vecptr4, align 32
507 %concat01 = shufflevector <8 x i16> %in.vec0, <8 x i16> %in.vec1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
508 %concat23 = shufflevector <8 x i16> %in.vec2, <8 x i16> %in.vec3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
509 %concat0123 = shufflevector <16 x i16> %concat01, <16 x i16> %concat23, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
510 %concat4uuu = shufflevector <8 x i16> %in.vec4, <8 x i16> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
511 %concat01234 = shufflevector <32 x i16> %concat0123, <32 x i16> %concat4uuu, <40 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39>
512 %interleaved.vec = shufflevector <40 x i16> %concat01234, <40 x i16> poison, <40 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 1, i32 9, i32 17, i32 25, i32 33, i32 2, i32 10, i32 18, i32 26, i32 34, i32 3, i32 11, i32 19, i32 27, i32 35, i32 4, i32 12, i32 20, i32 28, i32 36, i32 5, i32 13, i32 21, i32 29, i32 37, i32 6, i32 14, i32 22, i32 30, i32 38, i32 7, i32 15, i32 23, i32 31, i32 39>
514 store <40 x i16> %interleaved.vec, ptr %out.vec, align 32
515 ret void
516 }
519 define void @vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %out.vec) nounwind {
520 ; SSE-LABEL: vf16:
521 ; SSE: # %bb.0:
522 ; SSE-NEXT: pushq %rax
523 ; SSE-NEXT: movdqa (%rdi), %xmm3
524 ; SSE-NEXT: movdqa 16(%rdi), %xmm5
525 ; SSE-NEXT: movdqa (%rsi), %xmm6
526 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
527 ; SSE-NEXT: movdqa 16(%rsi), %xmm8
528 ; SSE-NEXT: movdqa 16(%rdx), %xmm10
529 ; SSE-NEXT: movdqa (%rcx), %xmm12
530 ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
531 ; SSE-NEXT: movdqa 16(%rcx), %xmm13
532 ; SSE-NEXT: movdqa 16(%r8), %xmm15
533 ; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,65535,65535,0,65535,65535,65535]
534 ; SSE-NEXT: movdqa %xmm11, %xmm1
535 ; SSE-NEXT: pandn %xmm5, %xmm1
536 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm8[3,3,3,3,4,5,6,7]
537 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4]
538 ; SSE-NEXT: pand %xmm11, %xmm2
539 ; SSE-NEXT: por %xmm1, %xmm2
540 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,0,0,65535,65535]
541 ; SSE-NEXT: movdqa %xmm1, %xmm7
542 ; SSE-NEXT: pandn %xmm2, %xmm7
543 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,2,2]
544 ; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,0,65535,65535,65535,65535,0]
545 ; SSE-NEXT: pand %xmm14, %xmm0
546 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm13[3,3,3,3,4,5,6,7]
547 ; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm4[0,1,2,3,4,4,4,4]
548 ; SSE-NEXT: movdqa %xmm14, %xmm4
549 ; SSE-NEXT: pandn %xmm9, %xmm4
550 ; SSE-NEXT: por %xmm0, %xmm4
551 ; SSE-NEXT: pand %xmm1, %xmm4
552 ; SSE-NEXT: por %xmm7, %xmm4
553 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,65535]
554 ; SSE-NEXT: pand %xmm2, %xmm4
555 ; SSE-NEXT: movdqa %xmm2, %xmm0
556 ; SSE-NEXT: pandn %xmm15, %xmm0
557 ; SSE-NEXT: por %xmm4, %xmm0
558 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
559 ; SSE-NEXT: movdqa %xmm11, %xmm0
560 ; SSE-NEXT: pandn %xmm3, %xmm0
561 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm6[3,3,3,3,4,5,6,7]
562 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4]
563 ; SSE-NEXT: pand %xmm11, %xmm4
564 ; SSE-NEXT: por %xmm0, %xmm4
565 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[3,3,3,3,4,5,6,7]
566 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
567 ; SSE-NEXT: movdqa %xmm14, %xmm7
568 ; SSE-NEXT: pandn %xmm0, %xmm7
569 ; SSE-NEXT: movdqa (%rdx), %xmm0
570 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
571 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,2]
572 ; SSE-NEXT: pand %xmm14, %xmm0
573 ; SSE-NEXT: por %xmm0, %xmm7
574 ; SSE-NEXT: pand %xmm1, %xmm7
575 ; SSE-NEXT: pandn %xmm4, %xmm1
576 ; SSE-NEXT: movdqa (%r8), %xmm0
577 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
578 ; SSE-NEXT: por %xmm7, %xmm1
579 ; SSE-NEXT: pand %xmm2, %xmm1
580 ; SSE-NEXT: pandn %xmm0, %xmm2
581 ; SSE-NEXT: por %xmm1, %xmm2
582 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
583 ; SSE-NEXT: movdqa %xmm5, %xmm1
584 ; SSE-NEXT: movdqa %xmm5, %xmm12
585 ; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm8[4],xmm12[5],xmm8[5],xmm12[6],xmm8[6],xmm12[7],xmm8[7]
586 ; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3]
587 ; SSE-NEXT: psrlq $48, %xmm8
588 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm8[1]
589 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,0,65535,65535,65535]
590 ; SSE-NEXT: movdqa %xmm0, %xmm7
591 ; SSE-NEXT: pandn %xmm1, %xmm7
592 ; SSE-NEXT: movdqa %xmm10, %xmm1
593 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm13[4],xmm1[5],xmm13[5],xmm1[6],xmm13[6],xmm1[7],xmm13[7]
594 ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm1[0,1,2,3,4,5,7,6]
595 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,1,3,3]
596 ; SSE-NEXT: pand %xmm0, %xmm6
597 ; SSE-NEXT: por %xmm7, %xmm6
598 ; SSE-NEXT: pand %xmm14, %xmm6
599 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm15[2,3,2,3]
600 ; SSE-NEXT: movdqa %xmm14, %xmm2
601 ; SSE-NEXT: pandn %xmm4, %xmm2
602 ; SSE-NEXT: por %xmm6, %xmm2
603 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
604 ; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
605 ; SSE-NEXT: movdqa %xmm0, %xmm7
606 ; SSE-NEXT: pandn %xmm1, %xmm7
607 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[0,2,3,3,4,5,6,7]
608 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,1,2,2]
609 ; SSE-NEXT: pand %xmm0, %xmm6
610 ; SSE-NEXT: por %xmm7, %xmm6
611 ; SSE-NEXT: movdqa {{.*#+}} xmm9 = [0,65535,65535,65535,65535,0,65535,65535]
612 ; SSE-NEXT: movdqa %xmm9, %xmm1
613 ; SSE-NEXT: pandn %xmm4, %xmm1
614 ; SSE-NEXT: pand %xmm9, %xmm6
615 ; SSE-NEXT: por %xmm6, %xmm1
616 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
617 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm5[0,1,2,3,4,5,6,6]
618 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
619 ; SSE-NEXT: movdqa %xmm13, %xmm6
620 ; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm10[0],xmm6[1],xmm10[1],xmm6[2],xmm10[2],xmm6[3],xmm10[3]
621 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[2,2,2,2,4,5,6,7]
622 ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,4,6,7]
623 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
624 ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
625 ; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,0,65535,65535,65535,65535,0,65535]
626 ; SSE-NEXT: pand %xmm12, %xmm6
627 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[0,1,0,1]
628 ; SSE-NEXT: movdqa %xmm12, %xmm15
629 ; SSE-NEXT: pandn %xmm1, %xmm15
630 ; SSE-NEXT: por %xmm6, %xmm15
631 ; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm13[0],xmm10[1],xmm13[1],xmm10[2],xmm13[2],xmm10[3],xmm13[3]
632 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm10[0,1,2,2,4,5,6,7]
633 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,1]
634 ; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,0,0,65535,65535,65535,0]
635 ; SSE-NEXT: movdqa %xmm13, %xmm6
636 ; SSE-NEXT: pandn %xmm10, %xmm6
637 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,3,2,4,5,6,7]
638 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,1,1]
639 ; SSE-NEXT: pand %xmm13, %xmm5
640 ; SSE-NEXT: por %xmm6, %xmm5
641 ; SSE-NEXT: movdqa %xmm11, %xmm10
642 ; SSE-NEXT: pandn %xmm1, %xmm10
643 ; SSE-NEXT: pand %xmm11, %xmm5
644 ; SSE-NEXT: por %xmm5, %xmm10
645 ; SSE-NEXT: movdqa %xmm3, %xmm8
646 ; SSE-NEXT: movdqa %xmm3, %xmm1
647 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
648 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
649 ; SSE-NEXT: movdqa %xmm3, %xmm5
650 ; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3]
651 ; SSE-NEXT: psrlq $48, %xmm4
652 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm4[1]
653 ; SSE-NEXT: movdqa %xmm0, %xmm4
654 ; SSE-NEXT: pandn %xmm5, %xmm4
655 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
656 ; SSE-NEXT: movdqa %xmm6, %xmm5
657 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
658 ; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
659 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,4,5,7,6]
660 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,3,3]
661 ; SSE-NEXT: pand %xmm0, %xmm3
662 ; SSE-NEXT: por %xmm4, %xmm3
663 ; SSE-NEXT: pand %xmm14, %xmm3
664 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
665 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[2,3,2,3]
666 ; SSE-NEXT: pandn %xmm4, %xmm14
667 ; SSE-NEXT: por %xmm3, %xmm14
668 ; SSE-NEXT: pslldq {{.*#+}} xmm5 = zero,zero,xmm5[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
669 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,3,3,4,5,6,7]
670 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,2]
671 ; SSE-NEXT: pand %xmm0, %xmm1
672 ; SSE-NEXT: pandn %xmm5, %xmm0
673 ; SSE-NEXT: por %xmm1, %xmm0
674 ; SSE-NEXT: pand %xmm9, %xmm0
675 ; SSE-NEXT: pandn %xmm4, %xmm9
676 ; SSE-NEXT: por %xmm0, %xmm9
677 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,4,5,6,6]
678 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
679 ; SSE-NEXT: movdqa %xmm2, %xmm1
680 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3]
681 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7]
682 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,6,7]
683 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
684 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
685 ; SSE-NEXT: pand %xmm12, %xmm1
686 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,1,0,1]
687 ; SSE-NEXT: pandn %xmm0, %xmm12
688 ; SSE-NEXT: por %xmm1, %xmm12
689 ; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
690 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm6[0,1,2,2,4,5,6,7]
691 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1]
692 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm8[0,1,3,2,4,5,6,7]
693 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,1,1]
694 ; SSE-NEXT: pand %xmm13, %xmm3
695 ; SSE-NEXT: pandn %xmm1, %xmm13
696 ; SSE-NEXT: por %xmm3, %xmm13
697 ; SSE-NEXT: pand %xmm11, %xmm13
698 ; SSE-NEXT: pandn %xmm0, %xmm11
699 ; SSE-NEXT: por %xmm13, %xmm11
700 ; SSE-NEXT: movdqa %xmm11, (%r9)
701 ; SSE-NEXT: movdqa %xmm12, 16(%r9)
702 ; SSE-NEXT: movdqa %xmm9, 48(%r9)
703 ; SSE-NEXT: movdqa %xmm14, 64(%r9)
704 ; SSE-NEXT: movdqa %xmm10, 80(%r9)
705 ; SSE-NEXT: movdqa %xmm15, 96(%r9)
706 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
707 ; SSE-NEXT: movaps %xmm0, 128(%r9)
708 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
709 ; SSE-NEXT: movaps %xmm0, 144(%r9)
710 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
711 ; SSE-NEXT: movaps %xmm0, 32(%r9)
712 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
713 ; SSE-NEXT: movaps %xmm0, 112(%r9)
714 ; SSE-NEXT: popq %rax
715 ; SSE-NEXT: retq
716 ;
717 ; AVX1-LABEL: vf16:
718 ; AVX1: # %bb.0:
719 ; AVX1-NEXT: vmovdqa (%rcx), %xmm13
720 ; AVX1-NEXT: vmovdqa 16(%rcx), %xmm7
721 ; AVX1-NEXT: vmovdqa (%rdx), %xmm9
722 ; AVX1-NEXT: vmovdqa 16(%rdx), %xmm3
723 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7]
724 ; AVX1-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
725 ; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,6]
726 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,3]
727 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
728 ; AVX1-NEXT: vmovaps {{.*#+}} ymm10 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535]
729 ; AVX1-NEXT: vandnps %ymm0, %ymm10, %ymm6
730 ; AVX1-NEXT: vmovdqa (%rdi), %xmm14
731 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm0
732 ; AVX1-NEXT: vmovdqa (%rsi), %xmm15
733 ; AVX1-NEXT: vmovdqa 16(%rsi), %xmm1
734 ; AVX1-NEXT: vpsrlq $48, %xmm1, %xmm2
735 ; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm0[1],xmm2[1]
736 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
737 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,3,3,4,5,6,7]
738 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,2]
739 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2
740 ; AVX1-NEXT: vandps %ymm2, %ymm10, %ymm2
741 ; AVX1-NEXT: vorps %ymm6, %ymm2, %ymm2
742 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm8
743 ; AVX1-NEXT: vmovdqa 16(%r8), %xmm11
744 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm11[2,3,2,3]
745 ; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1],xmm4[2],xmm8[3,4,5,6],xmm4[7]
746 ; AVX1-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
747 ; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm4[0],xmm2[1,2,3,4],xmm4[5],xmm2[6,7]
748 ; AVX1-NEXT: vpsrlq $48, %xmm15, %xmm2
749 ; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm14[1],xmm2[1]
750 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
751 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[0,1,3,2,4,5,6,7]
752 ; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,1,1]
753 ; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2
754 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm9[4],xmm13[4],xmm9[5],xmm13[5],xmm9[6],xmm13[6],xmm9[7],xmm13[7]
755 ; AVX1-NEXT: vpshufhw {{.*#+}} xmm5 = xmm12[0,1,2,3,4,5,7,6]
756 ; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,3,3]
757 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3]
758 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,2,2,4,5,6,7]
759 ; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,0,2,1]
760 ; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5
761 ; AVX1-NEXT: vandnps %ymm2, %ymm10, %ymm2
762 ; AVX1-NEXT: vandps %ymm5, %ymm10, %ymm5
763 ; AVX1-NEXT: vorps %ymm2, %ymm5, %ymm2
764 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
765 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,3,3,3,4,5,6,7]
766 ; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
767 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6,7]
768 ; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,4,5,6,6]
769 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
770 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
771 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[1,1,2,2]
772 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm7[3,3,3,3,4,5,6,7]
773 ; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4]
774 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2],xmm1[3,4,5,6],xmm4[7]
775 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm11[0,1,0,1]
776 ; AVX1-NEXT: vpblendw {{.*#+}} xmm10 = xmm5[0,1,2,3],xmm4[4],xmm5[5,6,7]
777 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3]
778 ; AVX1-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535]
779 ; AVX1-NEXT: vandnps %ymm0, %ymm5, %ymm0
780 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7]
781 ; AVX1-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,4,6,7]
782 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
783 ; AVX1-NEXT: vandps %ymm5, %ymm1, %ymm1
784 ; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
785 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
786 ; AVX1-NEXT: vpblendw {{.*#+}} xmm11 = xmm1[0,1,2],xmm11[3],xmm1[4,5,6,7]
787 ; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0],xmm4[1],xmm0[2,3,4,5],xmm4[6],xmm0[7]
788 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm13[0],xmm9[1],xmm13[1],xmm9[2],xmm13[2],xmm9[3],xmm13[3]
789 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7]
790 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,2,1]
791 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm9[0],xmm13[1],xmm9[1],xmm13[2],xmm9[2],xmm13[3],xmm9[3]
792 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
793 ; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,6,7]
794 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
795 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
796 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[0,1,3,2,4,5,6,7]
797 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,1,1]
798 ; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,6]
799 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
800 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1
801 ; AVX1-NEXT: vmovdqa (%r8), %xmm4
802 ; AVX1-NEXT: vandnps %ymm0, %ymm5, %ymm0
803 ; AVX1-NEXT: vandps %ymm5, %ymm1, %ymm1
804 ; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
805 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
806 ; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[0,1,0,1]
807 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3,4,5],xmm5[6],xmm1[7]
808 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4],xmm0[5,6,7]
809 ; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm9[1,1,2,2]
810 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm6 = xmm13[3,3,3,3,4,5,6,7]
811 ; AVX1-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4]
812 ; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2],xmm5[3,4,5,6],xmm6[7]
813 ; AVX1-NEXT: vpslldq {{.*#+}} xmm6 = zero,zero,xmm12[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
814 ; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5
815 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm6 = xmm15[3,3,3,3,4,5,6,7]
816 ; AVX1-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4]
817 ; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm14[4],xmm6[5,6,7]
818 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7]
819 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,3,3,4,5,6,7]
820 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,2]
821 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm3
822 ; AVX1-NEXT: vmovaps {{.*#+}} ymm6 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535]
823 ; AVX1-NEXT: vandnps %ymm5, %ymm6, %ymm5
824 ; AVX1-NEXT: vandps %ymm6, %ymm3, %ymm3
825 ; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[2,3,2,3]
826 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm6[2],xmm2[3,4,5,6],xmm6[7]
827 ; AVX1-NEXT: vorps %ymm5, %ymm3, %ymm3
828 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
829 ; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1,2,3,4],xmm6[5],xmm5[6,7]
830 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3],xmm3[4,5,6,7]
831 ; AVX1-NEXT: vmovdqa %xmm3, 32(%r9)
832 ; AVX1-NEXT: vmovdqa %xmm5, 48(%r9)
833 ; AVX1-NEXT: vmovdqa %xmm0, (%r9)
834 ; AVX1-NEXT: vmovdqa %xmm1, 16(%r9)
835 ; AVX1-NEXT: vmovdqa %xmm7, 96(%r9)
836 ; AVX1-NEXT: vmovdqa %xmm11, 112(%r9)
837 ; AVX1-NEXT: vmovdqa %xmm2, 64(%r9)
838 ; AVX1-NEXT: vmovdqa %xmm10, 80(%r9)
839 ; AVX1-NEXT: vmovdqa %xmm8, 128(%r9)
840 ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
841 ; AVX1-NEXT: vmovaps %xmm0, 144(%r9)
842 ; AVX1-NEXT: vzeroupper
843 ; AVX1-NEXT: retq
844 ;
845 ; AVX2-SLOW-LABEL: vf16:
846 ; AVX2-SLOW: # %bb.0:
847 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm9
848 ; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm2
849 ; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm3
850 ; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm4
851 ; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm8
852 ; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm6
853 ; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm7
854 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
855 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7]
856 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1]
857 ; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm1
858 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0
859 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
860 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
861 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,6]
862 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
863 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255>
864 ; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm0, %ymm5, %ymm0
865 ; AVX2-SLOW-NEXT: vpbroadcastq (%r8), %ymm5
866 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255]
867 ; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm0, %ymm5, %ymm5
868 ; AVX2-SLOW-NEXT: vpbroadcastq 8(%rdi), %xmm0
869 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13]
870 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
871 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
872 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[1,2,2,2]
873 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm7[10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9]
874 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0],xmm1[1],xmm6[2],xmm1[3],xmm6[4,5],xmm1[6],xmm6[7]
875 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,0]
876 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255>
877 ; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm0
878 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm8[0,1,1,1]
879 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255]
880 ; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm6
881 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm9[2,3,2,3,6,7,6,7]
882 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15]
883 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,2,6,7,6,6]
884 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
885 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2]
886 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[3,2,3,3,7,6,7,7]
887 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm4[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
888 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,3,2,3,6,7,6,7]
889 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm1[1],ymm7[2],ymm1[3,4],ymm7[5,6,7,8],ymm1[9],ymm7[10],ymm1[11,12],ymm7[13,14,15]
890 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,2]
891 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u>
892 ; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0
893 ; AVX2-SLOW-NEXT: vpbroadcastq 24(%r8), %ymm1
894 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0]
895 ; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0
896 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[1,1,1,2,5,5,5,6]
897 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25]
898 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm1[1],ymm7[2,3],ymm1[4],ymm7[5],ymm1[6],ymm7[7,8],ymm1[9],ymm7[10,11],ymm1[12],ymm7[13],ymm1[14],ymm7[15]
899 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
900 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm9[0,1,2,1,4,5,6,5]
901 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm2[3,1,2,2,4,5,6,7,11,9,10,10,12,13,14,15]
902 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
903 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm10[0,1],ymm7[2],ymm10[3],ymm7[4],ymm10[5,6],ymm7[7],ymm10[8,9],ymm7[10],ymm10[11],ymm7[12],ymm10[13,14],ymm7[15]
904 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3]
905 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255>
906 ; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm1, %ymm7, %ymm1
907 ; AVX2-SLOW-NEXT: vpbroadcastq 16(%r8), %ymm7
908 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255]
909 ; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm1, %ymm7, %ymm1
910 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u]
911 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[3,0,3,0,7,4,7,4]
912 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15]
913 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u]
914 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm9[1,1,2,2]
915 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3],ymm2[4],ymm4[5],ymm2[6],ymm4[7,8],ymm2[9],ymm4[10,11],ymm2[12],ymm4[13],ymm2[14],ymm4[15]
916 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255>
917 ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
918 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm8[1,1,2,2]
919 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255]
920 ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2
921 ; AVX2-SLOW-NEXT: vmovdqa %ymm2, 64(%r9)
922 ; AVX2-SLOW-NEXT: vmovdqa %ymm1, 96(%r9)
923 ; AVX2-SLOW-NEXT: vmovdqa %ymm0, 128(%r9)
924 ; AVX2-SLOW-NEXT: vmovdqa %ymm6, 32(%r9)
925 ; AVX2-SLOW-NEXT: vmovdqa %ymm5, (%r9)
926 ; AVX2-SLOW-NEXT: vzeroupper
927 ; AVX2-SLOW-NEXT: retq
929 ; AVX2-FAST-LABEL: vf16:
930 ; AVX2-FAST: # %bb.0:
931 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm9
932 ; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm10
933 ; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm3
934 ; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm4
935 ; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm8
936 ; AVX2-FAST-NEXT: vpbroadcastq 8(%rdi), %xmm5
937 ; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm6
938 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm6[6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13]
939 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2,3],xmm5[4],xmm7[5],xmm5[6],xmm7[7]
940 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1]
941 ; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm7
942 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[1,2,2,2]
943 ; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm0
944 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9]
945 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7]
946 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,0]
947 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255>
948 ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm5, %ymm1, %ymm1
949 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm8[0,1,1,1]
950 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255]
951 ; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm5
952 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm1
953 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3]
954 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13]
955 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
956 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3]
957 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7]
958 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
959 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255>
960 ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
961 ; AVX2-FAST-NEXT: vpbroadcastq (%r8), %ymm1
962 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255]
963 ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm6
964 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm9[2,3,2,3,6,7,6,7]
965 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,31,u,u,26,27,u,u,30,31,28,29,u,u,28,29]
966 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
967 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2]
968 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[3,2,3,3,7,6,7,7]
969 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,u,u,30,31,u,u,u,u,28,29,30,31,30,31]
970 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3,4],ymm2[5,6,7,8],ymm1[9],ymm2[10],ymm1[11,12],ymm2[13,14,15]
971 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,2]
972 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u>
973 ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
974 ; AVX2-FAST-NEXT: vpbroadcastq 24(%r8), %ymm1
975 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0]
976 ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
977 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[1,1,1,2,5,5,5,6]
978 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25]
979 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13],ymm1[14],ymm2[15]
980 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
981 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm9[0,1,2,1,4,5,6,5]
982 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,18,19,u,u,20,21,u,u,24,25,24,25,u,u]
983 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0,1],ymm2[2],ymm7[3],ymm2[4],ymm7[5,6],ymm2[7],ymm7[8,9],ymm2[10],ymm7[11],ymm2[12],ymm7[13,14],ymm2[15]
984 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3]
985 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255>
986 ; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm1, %ymm2, %ymm1
987 ; AVX2-FAST-NEXT: vpbroadcastq 16(%r8), %ymm2
988 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255]
989 ; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm1, %ymm2, %ymm1
990 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm4[u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u]
991 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[3,0,3,0,7,4,7,4]
992 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15]
993 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u]
994 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm9[1,1,2,2]
995 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13],ymm3[14],ymm4[15]
996 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255>
997 ; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2
998 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm8[1,1,2,2]
999 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255]
1000 ; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2
1001 ; AVX2-FAST-NEXT: vmovdqa %ymm2, 64(%r9)
1002 ; AVX2-FAST-NEXT: vmovdqa %ymm1, 96(%r9)
1003 ; AVX2-FAST-NEXT: vmovdqa %ymm0, 128(%r9)
1004 ; AVX2-FAST-NEXT: vmovdqa %ymm6, (%r9)
1005 ; AVX2-FAST-NEXT: vmovdqa %ymm5, 32(%r9)
1006 ; AVX2-FAST-NEXT: vzeroupper
1007 ; AVX2-FAST-NEXT: retq
1008 ;
1009 ; AVX512-LABEL: vf16:
1010 ; AVX512: # %bb.0:
1011 ; AVX512-NEXT: vmovdqa (%rdi), %ymm0
1012 ; AVX512-NEXT: vmovdqa (%rdx), %ymm1
1013 ; AVX512-NEXT: vmovdqa (%r8), %ymm2
1014 ; AVX512-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0
1015 ; AVX512-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1
1016 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = <u,13,29,45,61,u,14,30,46,62,u,15,31,47,63,u>
1017 ; AVX512-NEXT: vpermi2w %zmm1, %zmm0, %zmm3
1018 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,16,32,48,u,1,17,33,49,u,2,18,34,50,u,3,19,35,51,u,4,20,36,52,u,5,21,37,53,u,6,22>
1019 ; AVX512-NEXT: vpermi2w %zmm1, %zmm0, %zmm4
1020 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,32,5,6,7,8,33,10,11,12,13,34,15,16,17,18,35,20,21,22,23,36,25,26,27,28,37,30,31]
1021 ; AVX512-NEXT: vpermi2w %zmm2, %zmm4, %zmm5
1022 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = <6,22,u,39,55,7,23,u,40,56,8,24,u,41,57,9,25,u,42,58,10,26,u,43,59,11,27,u,44,60,12,28>
1023 ; AVX512-NEXT: vpermi2w %zmm0, %zmm1, %zmm4
1024 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,38,3,4,5,6,39,8,9,10,11,40,13,14,15,16,41,18,19,20,21,42,23,24,25,26,43,28,29,30,31]
1025 ; AVX512-NEXT: vpermi2w %zmm2, %zmm4, %zmm0
1026 ; AVX512-NEXT: vmovdqu64 %zmm0, 64(%r9)
1027 ; AVX512-NEXT: vmovdqu64 %zmm5, (%r9)
1028 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31]
1029 ; AVX512-NEXT: vpermi2w %ymm2, %ymm3, %ymm0
1030 ; AVX512-NEXT: vmovdqa %ymm0, 128(%r9)
1031 ; AVX512-NEXT: vzeroupper
1032 ; AVX512-NEXT: retq
1033 %in.vec0 = load <16 x i16>, ptr %in.vecptr0, align 32
1034 %in.vec1 = load <16 x i16>, ptr %in.vecptr1, align 32
1035 %in.vec2 = load <16 x i16>, ptr %in.vecptr2, align 32
1036 %in.vec3 = load <16 x i16>, ptr %in.vecptr3, align 32
1037 %in.vec4 = load <16 x i16>, ptr %in.vecptr4, align 32
1039 %concat01 = shufflevector <16 x i16> %in.vec0, <16 x i16> %in.vec1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
1040 %concat23 = shufflevector <16 x i16> %in.vec2, <16 x i16> %in.vec3, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
1041 %concat0123 = shufflevector <32 x i16> %concat01, <32 x i16> %concat23, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
1042 %concat4uuu = shufflevector <16 x i16> %in.vec4, <16 x i16> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1043 %concat01234 = shufflevector <64 x i16> %concat0123, <64 x i16> %concat4uuu, <80 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79>
1044 %interleaved.vec = shufflevector <80 x i16> %concat01234, <80 x i16> poison, <80 x i32> <i32 0, i32 16, i32 32, i32 48, i32 64, i32 1, i32 17, i32 33, i32 49, i32 65, i32 2, i32 18, i32 34, i32 50, i32 66, i32 3, i32 19, i32 35, i32 51, i32 67, i32 4, i32 20, i32 36, i32 52, i32 68, i32 5, i32 21, i32 37, i32 53, i32 69, i32 6, i32 22, i32 38, i32 54, i32 70, i32 7, i32 23, i32 39, i32 55, i32 71, i32 8, i32 24, i32 40, i32 56, i32 72, i32 9, i32 25, i32 41, i32 57, i32 73, i32 10, i32 26, i32 42, i32 58, i32 74, i32 11, i32 27, i32 43, i32 59, i32 75, i32 12, i32 28, i32 44, i32 60, i32 76, i32 13, i32 29, i32 45, i32 61, i32 77, i32 14, i32 30, i32 46, i32 62, i32 78, i32 15, i32 31, i32 47, i32 63, i32 79>
1046 store <80 x i16> %interleaved.vec, ptr %out.vec, align 32
1047 ret void
1048 }
1051 define void @vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %out.vec) nounwind {
1052 ; SSE-LABEL: vf32:
1053 ; SSE: # %bb.0:
1054 ; SSE-NEXT: subq $248, %rsp
1055 ; SSE-NEXT: movdqa (%rdi), %xmm0
1056 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1057 ; SSE-NEXT: movdqa 16(%rdi), %xmm10
1058 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1059 ; SSE-NEXT: movdqa (%rsi), %xmm13
1060 ; SSE-NEXT: movdqa 16(%rsi), %xmm9
1061 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1062 ; SSE-NEXT: movdqa (%rdx), %xmm14
1063 ; SSE-NEXT: movdqa (%rcx), %xmm11
1064 ; SSE-NEXT: movdqa 16(%rcx), %xmm12
1065 ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1066 ; SSE-NEXT: movdqa (%r8), %xmm2
1067 ; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,65535,65535,0,65535,65535,65535]
1068 ; SSE-NEXT: movdqa %xmm15, %xmm1
1069 ; SSE-NEXT: pandn %xmm0, %xmm1
1070 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm13[3,3,3,3,4,5,6,7]
1071 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4]
1072 ; SSE-NEXT: pand %xmm15, %xmm3
1073 ; SSE-NEXT: por %xmm1, %xmm3
1074 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,0,0,65535,65535]
1075 ; SSE-NEXT: movdqa %xmm1, %xmm4
1076 ; SSE-NEXT: pandn %xmm3, %xmm4
1077 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm14[1,1,2,2]
1078 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,0,65535,65535,65535,65535,0]
1079 ; SSE-NEXT: pand %xmm8, %xmm5
1080 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm11[3,3,3,3,4,5,6,7]
1081 ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4]
1082 ; SSE-NEXT: movdqa %xmm8, %xmm7
1083 ; SSE-NEXT: pandn %xmm6, %xmm7
1084 ; SSE-NEXT: por %xmm5, %xmm7
1085 ; SSE-NEXT: pand %xmm1, %xmm7
1086 ; SSE-NEXT: por %xmm4, %xmm7
1087 ; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,0,65535,65535,65535,65535]
1088 ; SSE-NEXT: pand %xmm6, %xmm7
1089 ; SSE-NEXT: movdqa %xmm6, %xmm0
1090 ; SSE-NEXT: pandn %xmm2, %xmm0
1091 ; SSE-NEXT: movdqa %xmm2, %xmm3
1092 ; SSE-NEXT: por %xmm7, %xmm0
1093 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1094 ; SSE-NEXT: movdqa %xmm15, %xmm4
1095 ; SSE-NEXT: pandn %xmm10, %xmm4
1096 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm9[3,3,3,3,4,5,6,7]
1097 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4]
1098 ; SSE-NEXT: pand %xmm15, %xmm5
1099 ; SSE-NEXT: por %xmm4, %xmm5
1100 ; SSE-NEXT: movdqa %xmm1, %xmm7
1101 ; SSE-NEXT: pandn %xmm5, %xmm7
1102 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm12[3,3,3,3,4,5,6,7]
1103 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4]
1104 ; SSE-NEXT: movdqa %xmm8, %xmm5
1105 ; SSE-NEXT: pandn %xmm4, %xmm5
1106 ; SSE-NEXT: movdqa 16(%rdx), %xmm10
1107 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,2,2]
1108 ; SSE-NEXT: pand %xmm8, %xmm0
1109 ; SSE-NEXT: por %xmm0, %xmm5
1110 ; SSE-NEXT: pand %xmm1, %xmm5
1111 ; SSE-NEXT: por %xmm7, %xmm5
1112 ; SSE-NEXT: movdqa 16(%r8), %xmm2
1113 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1114 ; SSE-NEXT: pand %xmm6, %xmm5
1115 ; SSE-NEXT: movdqa %xmm6, %xmm0
1116 ; SSE-NEXT: pandn %xmm2, %xmm0
1117 ; SSE-NEXT: por %xmm5, %xmm0
1118 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1119 ; SSE-NEXT: movdqa 32(%rdi), %xmm2
1120 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1121 ; SSE-NEXT: movdqa %xmm15, %xmm0
1122 ; SSE-NEXT: pandn %xmm2, %xmm0
1123 ; SSE-NEXT: movdqa 32(%rsi), %xmm2
1124 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1125 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm2[3,3,3,3,4,5,6,7]
1126 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4]
1127 ; SSE-NEXT: pand %xmm15, %xmm5
1128 ; SSE-NEXT: por %xmm0, %xmm5
1129 ; SSE-NEXT: movdqa %xmm1, %xmm0
1130 ; SSE-NEXT: pandn %xmm5, %xmm0
1131 ; SSE-NEXT: movdqa 32(%rcx), %xmm2
1132 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1133 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm2[3,3,3,3,4,5,6,7]
1134 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4]
1135 ; SSE-NEXT: movdqa %xmm8, %xmm7
1136 ; SSE-NEXT: pandn %xmm5, %xmm7
1137 ; SSE-NEXT: movdqa 32(%rdx), %xmm12
1138 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm12[1,1,2,2]
1139 ; SSE-NEXT: pand %xmm8, %xmm5
1140 ; SSE-NEXT: por %xmm5, %xmm7
1141 ; SSE-NEXT: pand %xmm1, %xmm7
1142 ; SSE-NEXT: por %xmm0, %xmm7
1143 ; SSE-NEXT: pand %xmm6, %xmm7
1144 ; SSE-NEXT: movdqa 32(%r8), %xmm2
1145 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1146 ; SSE-NEXT: movdqa %xmm6, %xmm0
1147 ; SSE-NEXT: pandn %xmm2, %xmm0
1148 ; SSE-NEXT: por %xmm7, %xmm0
1149 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1150 ; SSE-NEXT: movdqa 48(%rdi), %xmm2
1151 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1152 ; SSE-NEXT: movdqa %xmm15, %xmm0
1153 ; SSE-NEXT: pandn %xmm2, %xmm0
1154 ; SSE-NEXT: movdqa 48(%rsi), %xmm2
1155 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1156 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm2[3,3,3,3,4,5,6,7]
1157 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4]
1158 ; SSE-NEXT: pand %xmm15, %xmm5
1159 ; SSE-NEXT: por %xmm0, %xmm5
1160 ; SSE-NEXT: movdqa 48(%rcx), %xmm0
1161 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1162 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
1163 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
1164 ; SSE-NEXT: movdqa %xmm8, %xmm7
1165 ; SSE-NEXT: pandn %xmm0, %xmm7
1166 ; SSE-NEXT: movdqa 48(%rdx), %xmm0
1167 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1168 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,2]
1169 ; SSE-NEXT: pand %xmm8, %xmm0
1170 ; SSE-NEXT: por %xmm0, %xmm7
1171 ; SSE-NEXT: pand %xmm1, %xmm7
1172 ; SSE-NEXT: pandn %xmm5, %xmm1
1173 ; SSE-NEXT: por %xmm7, %xmm1
1174 ; SSE-NEXT: pand %xmm6, %xmm1
1175 ; SSE-NEXT: movdqa 48(%r8), %xmm0
1176 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1177 ; SSE-NEXT: pandn %xmm0, %xmm6
1178 ; SSE-NEXT: por %xmm1, %xmm6
1179 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1180 ; SSE-NEXT: movdqa %xmm14, %xmm0
1181 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3]
1182 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7]
1183 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1]
1184 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,0,0,65535,65535,65535,0]
1185 ; SSE-NEXT: movdqa %xmm1, %xmm5
1186 ; SSE-NEXT: pandn %xmm0, %xmm5
1187 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
1188 ; SSE-NEXT: movdqa %xmm6, %xmm0
1189 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3]
1190 ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm0[0,1,3,2,4,5,6,7]
1191 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,1,1]
1192 ; SSE-NEXT: pand %xmm1, %xmm7
1193 ; SSE-NEXT: por %xmm5, %xmm7
1194 ; SSE-NEXT: pand %xmm15, %xmm7
1195 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,0,1]
1196 ; SSE-NEXT: movdqa %xmm15, %xmm4
1197 ; SSE-NEXT: pandn %xmm2, %xmm4
1198 ; SSE-NEXT: por %xmm7, %xmm4
1199 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1200 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,6]
1201 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
1202 ; SSE-NEXT: movdqa %xmm11, %xmm5
1203 ; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1],xmm5[2],xmm14[2],xmm5[3],xmm14[3]
1204 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,2,2,2,4,5,6,7]
1205 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,4,6,7]
1206 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,2,2,3]
1207 ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1]
1208 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,0,65535,65535,65535,65535,0,65535]
1209 ; SSE-NEXT: movdqa %xmm4, %xmm0
1210 ; SSE-NEXT: pandn %xmm2, %xmm0
1211 ; SSE-NEXT: pand %xmm4, %xmm7
1212 ; SSE-NEXT: por %xmm7, %xmm0
1213 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1214 ; SSE-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm11[4],xmm14[5],xmm11[5],xmm14[6],xmm11[6],xmm14[7],xmm11[7]
1215 ; SSE-NEXT: movdqa %xmm14, %xmm2
1216 ; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
1217 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,0,65535,65535,65535]
1218 ; SSE-NEXT: movdqa %xmm0, %xmm7
1219 ; SSE-NEXT: pandn %xmm2, %xmm7
1220 ; SSE-NEXT: movdqa %xmm6, %xmm2
1221 ; SSE-NEXT: movdqa %xmm13, %xmm5
1222 ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm13[4],xmm2[5],xmm13[5],xmm2[6],xmm13[6],xmm2[7],xmm13[7]
1223 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,3,3,4,5,6,7]
1224 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,2]
1225 ; SSE-NEXT: pand %xmm0, %xmm2
1226 ; SSE-NEXT: por %xmm7, %xmm2
1227 ; SSE-NEXT: movdqa {{.*#+}} xmm13 = [0,65535,65535,65535,65535,0,65535,65535]
1228 ; SSE-NEXT: pand %xmm13, %xmm2
1229 ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm3[2,3,2,3]
1230 ; SSE-NEXT: movdqa %xmm13, %xmm3
1231 ; SSE-NEXT: pandn %xmm11, %xmm3
1232 ; SSE-NEXT: por %xmm2, %xmm3
1233 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1234 ; SSE-NEXT: movdqa %xmm5, %xmm2
1235 ; SSE-NEXT: psrlq $48, %xmm2
1236 ; SSE-NEXT: movdqa %xmm6, %xmm3
1237 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
1238 ; SSE-NEXT: movdqa %xmm0, %xmm2
1239 ; SSE-NEXT: pandn %xmm3, %xmm2
1240 ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm14[0,1,2,3,4,5,7,6]
1241 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,1,3,3]
1242 ; SSE-NEXT: pand %xmm0, %xmm6
1243 ; SSE-NEXT: por %xmm2, %xmm6
1244 ; SSE-NEXT: movdqa %xmm8, %xmm2
1245 ; SSE-NEXT: pandn %xmm11, %xmm2
1246 ; SSE-NEXT: pand %xmm8, %xmm6
1247 ; SSE-NEXT: por %xmm6, %xmm2
1248 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1249 ; SSE-NEXT: movdqa %xmm10, %xmm2
1250 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
1251 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3]
1252 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,2,2,4,5,6,7]
1253 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,1]
1254 ; SSE-NEXT: movdqa %xmm1, %xmm6
1255 ; SSE-NEXT: pandn %xmm2, %xmm6
1256 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
1257 ; SSE-NEXT: movdqa %xmm9, %xmm2
1258 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
1259 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
1260 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,1,3,2,4,5,6,7]
1261 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,1,1]
1262 ; SSE-NEXT: pand %xmm1, %xmm4
1263 ; SSE-NEXT: por %xmm6, %xmm4
1264 ; SSE-NEXT: pand %xmm15, %xmm4
1265 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
1266 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm11[0,1,0,1]
1267 ; SSE-NEXT: movdqa %xmm15, %xmm3
1268 ; SSE-NEXT: pandn %xmm6, %xmm3
1269 ; SSE-NEXT: por %xmm4, %xmm3
1270 ; SSE-NEXT: movdqa %xmm3, (%rsp) # 16-byte Spill
1271 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,6]
1272 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
1273 ; SSE-NEXT: movdqa %xmm7, %xmm4
1274 ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3]
1275 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7]
1276 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,4,6,7]
1277 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
1278 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
1279 ; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,0,65535,65535,65535,65535,0,65535]
1280 ; SSE-NEXT: movdqa %xmm14, %xmm2
1281 ; SSE-NEXT: pandn %xmm6, %xmm2
1282 ; SSE-NEXT: pand %xmm14, %xmm4
1283 ; SSE-NEXT: por %xmm4, %xmm2
1284 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1285 ; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7]
1286 ; SSE-NEXT: movdqa %xmm10, %xmm2
1287 ; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
1288 ; SSE-NEXT: movdqa %xmm0, %xmm4
1289 ; SSE-NEXT: pandn %xmm2, %xmm4
1290 ; SSE-NEXT: movdqa %xmm9, %xmm3
1291 ; SSE-NEXT: movdqa %xmm9, %xmm2
1292 ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
1293 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,3,3,4,5,6,7]
1294 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,2]
1295 ; SSE-NEXT: pand %xmm0, %xmm2
1296 ; SSE-NEXT: por %xmm4, %xmm2
1297 ; SSE-NEXT: pand %xmm13, %xmm2
1298 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[2,3,2,3]
1299 ; SSE-NEXT: movdqa %xmm13, %xmm6
1300 ; SSE-NEXT: pandn %xmm4, %xmm6
1301 ; SSE-NEXT: por %xmm2, %xmm6
1302 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1303 ; SSE-NEXT: movdqa %xmm5, %xmm2
1304 ; SSE-NEXT: psrlq $48, %xmm2
1305 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
1306 ; SSE-NEXT: movdqa %xmm0, %xmm2
1307 ; SSE-NEXT: pandn %xmm3, %xmm2
1308 ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm10[0,1,2,3,4,5,7,6]
1309 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,1,3,3]
1310 ; SSE-NEXT: pand %xmm0, %xmm6
1311 ; SSE-NEXT: por %xmm2, %xmm6
1312 ; SSE-NEXT: movdqa %xmm8, %xmm2
1313 ; SSE-NEXT: pandn %xmm4, %xmm2
1314 ; SSE-NEXT: pand %xmm8, %xmm6
1315 ; SSE-NEXT: por %xmm6, %xmm2
1316 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1317 ; SSE-NEXT: movdqa %xmm12, %xmm2
1318 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
1319 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
1320 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,2,2,4,5,6,7]
1321 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,1]
1322 ; SSE-NEXT: movdqa %xmm1, %xmm4
1323 ; SSE-NEXT: pandn %xmm2, %xmm4
1324 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
1325 ; SSE-NEXT: movdqa %xmm9, %xmm2
1326 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
1327 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
1328 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm2[0,1,3,2,4,5,6,7]
1329 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,1,1]
1330 ; SSE-NEXT: pand %xmm1, %xmm6
1331 ; SSE-NEXT: por %xmm4, %xmm6
1332 ; SSE-NEXT: pand %xmm15, %xmm6
1333 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
1334 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[0,1,0,1]
1335 ; SSE-NEXT: movdqa %xmm15, %xmm7
1336 ; SSE-NEXT: pandn %xmm4, %xmm7
1337 ; SSE-NEXT: por %xmm6, %xmm7
1338 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1339 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,6]
1340 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
1341 ; SSE-NEXT: movdqa %xmm5, %xmm6
1342 ; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3]
1343 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[2,2,2,2,4,5,6,7]
1344 ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,4,6,7]
1345 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
1346 ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1]
1347 ; SSE-NEXT: movdqa %xmm14, %xmm11
1348 ; SSE-NEXT: pandn %xmm4, %xmm11
1349 ; SSE-NEXT: pand %xmm14, %xmm6
1350 ; SSE-NEXT: por %xmm6, %xmm11
1351 ; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm5[4],xmm12[5],xmm5[5],xmm12[6],xmm5[6],xmm12[7],xmm5[7]
1352 ; SSE-NEXT: movdqa %xmm12, %xmm4
1353 ; SSE-NEXT: movdqa %xmm12, %xmm5
1354 ; SSE-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,xmm4[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
1355 ; SSE-NEXT: movdqa %xmm0, %xmm6
1356 ; SSE-NEXT: pandn %xmm4, %xmm6
1357 ; SSE-NEXT: movdqa %xmm9, %xmm4
1358 ; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
1359 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,3,3,4,5,6,7]
1360 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,2]
1361 ; SSE-NEXT: pand %xmm0, %xmm4
1362 ; SSE-NEXT: por %xmm6, %xmm4
1363 ; SSE-NEXT: pand %xmm13, %xmm4
1364 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm10[2,3,2,3]
1365 ; SSE-NEXT: movdqa %xmm13, %xmm12
1366 ; SSE-NEXT: pandn %xmm6, %xmm12
1367 ; SSE-NEXT: por %xmm4, %xmm12
1368 ; SSE-NEXT: psrlq $48, %xmm3
1369 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm9 = xmm9[1],xmm3[1]
1370 ; SSE-NEXT: movdqa %xmm0, %xmm4
1371 ; SSE-NEXT: pandn %xmm9, %xmm4
1372 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,5,7,6]
1373 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,3]
1374 ; SSE-NEXT: pand %xmm0, %xmm2
1375 ; SSE-NEXT: por %xmm4, %xmm2
1376 ; SSE-NEXT: movdqa %xmm8, %xmm9
1377 ; SSE-NEXT: pandn %xmm6, %xmm9
1378 ; SSE-NEXT: pand %xmm8, %xmm2
1379 ; SSE-NEXT: por %xmm2, %xmm9
1380 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
1381 ; SSE-NEXT: movdqa %xmm3, %xmm2
1382 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
1383 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3]
1384 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,2,2,4,5,6,7]
1385 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,1]
1386 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
1387 ; SSE-NEXT: movdqa %xmm10, %xmm4
1388 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
1389 ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3]
1390 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm4[0,1,3,2,4,5,6,7]
1391 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,1,1]
1392 ; SSE-NEXT: pand %xmm1, %xmm6
1393 ; SSE-NEXT: pandn %xmm2, %xmm1
1394 ; SSE-NEXT: por %xmm6, %xmm1
1395 ; SSE-NEXT: pand %xmm15, %xmm1
1396 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
1397 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,1,0,1]
1398 ; SSE-NEXT: pandn %xmm2, %xmm15
1399 ; SSE-NEXT: por %xmm1, %xmm15
1400 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,4,5,6,6]
1401 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
1402 ; SSE-NEXT: movdqa %xmm14, %xmm4
1403 ; SSE-NEXT: movdqa %xmm3, %xmm6
1404 ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
1405 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7]
1406 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,4,6,7]
1407 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
1408 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
1409 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,0,65535]
1410 ; SSE-NEXT: pand %xmm3, %xmm4
1411 ; SSE-NEXT: pandn %xmm2, %xmm3
1412 ; SSE-NEXT: por %xmm4, %xmm3
1413 ; SSE-NEXT: movdqa %xmm6, %xmm2
1414 ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7]
1415 ; SSE-NEXT: movdqa %xmm2, %xmm1
1416 ; SSE-NEXT: movdqa %xmm2, %xmm14
1417 ; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
1418 ; SSE-NEXT: movdqa %xmm0, %xmm2
1419 ; SSE-NEXT: pandn %xmm1, %xmm2
1420 ; SSE-NEXT: movdqa %xmm10, %xmm1
1421 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7]
1422 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,3,3,4,5,6,7]
1423 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,2]
1424 ; SSE-NEXT: pand %xmm0, %xmm1
1425 ; SSE-NEXT: por %xmm2, %xmm1
1426 ; SSE-NEXT: pand %xmm13, %xmm1
1427 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[2,3,2,3]
1428 ; SSE-NEXT: pandn %xmm2, %xmm13
1429 ; SSE-NEXT: por %xmm1, %xmm13
1430 ; SSE-NEXT: psrlq $48, %xmm7
1431 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm10 = xmm10[1],xmm7[1]
1432 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm14[0,1,2,3,4,5,7,6]
1433 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
1434 ; SSE-NEXT: pand %xmm0, %xmm1
1435 ; SSE-NEXT: pandn %xmm10, %xmm0
1436 ; SSE-NEXT: por %xmm1, %xmm0
1437 ; SSE-NEXT: pand %xmm8, %xmm0
1438 ; SSE-NEXT: pandn %xmm2, %xmm8
1439 ; SSE-NEXT: por %xmm0, %xmm8
1440 ; SSE-NEXT: movdqa %xmm8, 304(%r9)
1441 ; SSE-NEXT: movdqa %xmm13, 288(%r9)
1442 ; SSE-NEXT: movdqa %xmm3, 256(%r9)
1443 ; SSE-NEXT: movdqa %xmm15, 240(%r9)
1444 ; SSE-NEXT: movdqa %xmm9, 224(%r9)
1445 ; SSE-NEXT: movdqa %xmm12, 208(%r9)
1446 ; SSE-NEXT: movdqa %xmm11, 176(%r9)
1447 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1448 ; SSE-NEXT: movaps %xmm0, 160(%r9)
1449 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1450 ; SSE-NEXT: movaps %xmm0, 144(%r9)
1451 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1452 ; SSE-NEXT: movaps %xmm0, 128(%r9)
1453 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1454 ; SSE-NEXT: movaps %xmm0, 96(%r9)
1455 ; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
1456 ; SSE-NEXT: movaps %xmm0, 80(%r9)
1457 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1458 ; SSE-NEXT: movaps %xmm0, 64(%r9)
1459 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1460 ; SSE-NEXT: movaps %xmm0, 48(%r9)
1461 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1462 ; SSE-NEXT: movaps %xmm0, 16(%r9)
1463 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1464 ; SSE-NEXT: movaps %xmm0, (%r9)
1465 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1466 ; SSE-NEXT: movaps %xmm0, 272(%r9)
1467 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1468 ; SSE-NEXT: movaps %xmm0, 192(%r9)
1469 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1470 ; SSE-NEXT: movaps %xmm0, 112(%r9)
1471 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1472 ; SSE-NEXT: movaps %xmm0, 32(%r9)
1473 ; SSE-NEXT: addq $248, %rsp
1474 ; SSE-NEXT: retq
1475 ;
1476 ; AVX1-LABEL: vf32:
1477 ; AVX1: # %bb.0:
1478 ; AVX1-NEXT: subq $72, %rsp
1479 ; AVX1-NEXT: vmovdqa 32(%rdi), %xmm11
1480 ; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3
1481 ; AVX1-NEXT: vmovdqa 32(%rsi), %xmm15
1482 ; AVX1-NEXT: vmovdqa 48(%rsi), %xmm5
1483 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm5[3,3,3,3,4,5,6,7]
1484 ; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
1485 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5,6,7]
1486 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
1487 ; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm13[0,1,2,3,4,5,6,6]
1488 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1489 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1490 ; AVX1-NEXT: vmovaps {{.*#+}} ymm12 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535]
1491 ; AVX1-NEXT: vandnps %ymm0, %ymm12, %ymm1
1492 ; AVX1-NEXT: vmovdqa 32(%rdx), %xmm9
1493 ; AVX1-NEXT: vmovdqa 48(%rdx), %xmm0
1494 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,2]
1495 ; AVX1-NEXT: vmovdqa 32(%rcx), %xmm6
1496 ; AVX1-NEXT: vmovdqa 48(%rcx), %xmm7
1497 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm7[3,3,3,3,4,5,6,7]
1498 ; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4]
1499 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2],xmm2[3,4,5,6],xmm4[7]
1500 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3]
1501 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7]
1502 ; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,4,6,7]
1503 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2
1504 ; AVX1-NEXT: vandps %ymm2, %ymm12, %ymm2
1505 ; AVX1-NEXT: vorps %ymm1, %ymm2, %ymm10
1506 ; AVX1-NEXT: vmovdqa 48(%r8), %xmm1
1507 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7]
1508 ; AVX1-NEXT: vpslldq {{.*#+}} xmm8 = zero,zero,xmm4[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
1509 ; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,6]
1510 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,3,3]
1511 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm8, %ymm4
1512 ; AVX1-NEXT: vmovaps {{.*#+}} ymm14 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535]
1513 ; AVX1-NEXT: vandnps %ymm4, %ymm14, %ymm4
1514 ; AVX1-NEXT: vpsrlq $48, %xmm5, %xmm2
1515 ; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1]
1516 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
1517 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,3,3,4,5,6,7]
1518 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,2]
1519 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
1520 ; AVX1-NEXT: vandps %ymm2, %ymm14, %ymm2
1521 ; AVX1-NEXT: vorps %ymm4, %ymm2, %ymm2
1522 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
1523 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0],xmm2[1,2,3,4],xmm3[5],xmm2[6,7]
1524 ; AVX1-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1525 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
1526 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4,5,6],xmm3[7]
1527 ; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1528 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[1,1,2,2]
1529 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[3,3,3,3,4,5,6,7]
1530 ; AVX1-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4]
1531 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4,5,6],xmm3[7]
1532 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7]
1533 ; AVX1-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
1534 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
1535 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm15[3,3,3,3,4,5,6,7]
1536 ; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4]
1537 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm11[4],xmm4[5,6,7]
1538 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm11[4],xmm15[4],xmm11[5],xmm15[5],xmm11[6],xmm15[6],xmm11[7],xmm15[7]
1539 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,3,3,4,5,6,7]
1540 ; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,2]
1541 ; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4
1542 ; AVX1-NEXT: vmovaps {{.*#+}} ymm5 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535]
1543 ; AVX1-NEXT: vandnps %ymm2, %ymm5, %ymm2
1544 ; AVX1-NEXT: vandps %ymm5, %ymm4, %ymm4
1545 ; AVX1-NEXT: vorps %ymm2, %ymm4, %ymm5
1546 ; AVX1-NEXT: vpsrlq $48, %xmm15, %xmm2
1547 ; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm11[1],xmm2[1]
1548 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm13[0,1,3,2,4,5,6,7]
1549 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,1,1]
1550 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
1551 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3]
1552 ; AVX1-NEXT: vmovdqa 32(%r8), %xmm4
1553 ; AVX1-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,6]
1554 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,3,3]
1555 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7]
1556 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,2,1]
1557 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
1558 ; AVX1-NEXT: vextractf128 $1, %ymm10, %xmm3
1559 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm1[3],xmm3[4,5,6,7]
1560 ; AVX1-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill
1561 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
1562 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm10[0],xmm1[1],xmm10[2,3,4,5],xmm1[6],xmm10[7]
1563 ; AVX1-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1564 ; AVX1-NEXT: vandnps %ymm2, %ymm14, %ymm2
1565 ; AVX1-NEXT: vandps %ymm0, %ymm14, %ymm0
1566 ; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0
1567 ; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm2
1568 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[2,3,2,3]
1569 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3,4],xmm3[5],xmm2[6,7]
1570 ; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1571 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm3[2],xmm0[3,4,5,6],xmm3[7]
1572 ; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1573 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1574 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7]
1575 ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1576 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3]
1577 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3]
1578 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7]
1579 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,2,1]
1580 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
1581 ; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,6,7]
1582 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1583 ; AVX1-NEXT: vmovdqa 16(%rdx), %xmm9
1584 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm11[0],xmm15[0],xmm11[1],xmm15[1],xmm11[2],xmm15[2],xmm11[3],xmm15[3]
1585 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[0,1,3,2,4,5,6,7]
1586 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,1,1]
1587 ; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,6]
1588 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1589 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
1590 ; AVX1-NEXT: vmovdqa 16(%rcx), %xmm6
1591 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2],xmm4[3],xmm5[4,5,6,7]
1592 ; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1593 ; AVX1-NEXT: vandnps %ymm0, %ymm12, %ymm0
1594 ; AVX1-NEXT: vandps %ymm1, %ymm12, %ymm1
1595 ; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
1596 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[0,1,0,1]
1597 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7]
1598 ; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1599 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1600 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7]
1601 ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1602 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7]
1603 ; AVX1-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
1604 ; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,6]
1605 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,3]
1606 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm2
1607 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
1608 ; AVX1-NEXT: vmovdqa 16(%rsi), %xmm0
1609 ; AVX1-NEXT: vpsrlq $48, %xmm0, %xmm4
1610 ; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm1[1],xmm4[1]
1611 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1612 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,3,3,4,5,6,7]
1613 ; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,2]
1614 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4
1615 ; AVX1-NEXT: vandnps %ymm2, %ymm14, %ymm2
1616 ; AVX1-NEXT: vandps %ymm4, %ymm14, %ymm4
1617 ; AVX1-NEXT: vorps %ymm2, %ymm4, %ymm2
1618 ; AVX1-NEXT: vmovdqa 16(%r8), %xmm8
1619 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[2,3,2,3]
1620 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm2[1,2,3,4],xmm4[5],xmm2[6,7]
1621 ; AVX1-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1622 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
1623 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2],xmm2[3,4,5,6],xmm4[7]
1624 ; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1625 ; AVX1-NEXT: vmovdqa (%rdi), %xmm10
1626 ; AVX1-NEXT: vmovdqa (%rsi), %xmm13
1627 ; AVX1-NEXT: vpsrlq $48, %xmm13, %xmm2
1628 ; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm10[1],xmm2[1]
1629 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1630 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm7[0,1,3,2,4,5,6,7]
1631 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,1,1]
1632 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
1633 ; AVX1-NEXT: vmovdqa (%rdx), %xmm3
1634 ; AVX1-NEXT: vmovdqa (%rcx), %xmm4
1635 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
1636 ; AVX1-NEXT: vpshufhw {{.*#+}} xmm15 = xmm12[0,1,2,3,4,5,7,6]
1637 ; AVX1-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,1,3,3]
1638 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3]
1639 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm5 = xmm11[0,1,2,2,4,5,6,7]
1640 ; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,0,2,1]
1641 ; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm15, %ymm5
1642 ; AVX1-NEXT: vandnps %ymm2, %ymm14, %ymm2
1643 ; AVX1-NEXT: vandps %ymm5, %ymm14, %ymm5
1644 ; AVX1-NEXT: vorps %ymm2, %ymm5, %ymm5
1645 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
1646 ; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
1647 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7]
1648 ; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,4,5,6,6]
1649 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1650 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1651 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[3,3,3,3,4,5,6,7]
1652 ; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
1653 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[1,1,2,2]
1654 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3,4,5,6],xmm1[7]
1655 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3]
1656 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7]
1657 ; AVX1-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,6,7]
1658 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
1659 ; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535]
1660 ; AVX1-NEXT: vandnps %ymm0, %ymm2, %ymm0
1661 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
1662 ; AVX1-NEXT: vmovaps %ymm2, %ymm14
1663 ; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm2
1664 ; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm0
1665 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm8[0,1,0,1]
1666 ; AVX1-NEXT: vpblendw {{.*#+}} xmm9 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7]
1667 ; AVX1-NEXT: vpblendw {{.*#+}} xmm11 = xmm2[0],xmm1[1],xmm2[2,3,4,5],xmm1[6],xmm2[7]
1668 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
1669 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[3],xmm2[4,5,6,7]
1670 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
1671 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,2,2,4,5,6,7]
1672 ; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,0,2,1]
1673 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
1674 ; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,2,3]
1675 ; AVX1-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,4,6,7]
1676 ; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6
1677 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm10[0],xmm13[0],xmm10[1],xmm13[1],xmm10[2],xmm13[2],xmm10[3],xmm13[3]
1678 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[0,1,3,2,4,5,6,7]
1679 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,1]
1680 ; AVX1-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,6,6]
1681 ; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,2,2,3]
1682 ; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm0
1683 ; AVX1-NEXT: vandnps %ymm6, %ymm14, %ymm6
1684 ; AVX1-NEXT: vandps %ymm0, %ymm14, %ymm0
1685 ; AVX1-NEXT: vorps %ymm6, %ymm0, %ymm0
1686 ; AVX1-NEXT: vmovdqa (%r8), %xmm6
1687 ; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[0,1,0,1]
1688 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm7[4],xmm0[5,6,7]
1689 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1690 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm7[1],xmm0[2,3,4,5],xmm7[6],xmm0[7]
1691 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,2]
1692 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,3,3,3,4,5,6,7]
1693 ; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4]
1694 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3,4,5,6],xmm4[7]
1695 ; AVX1-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,xmm12[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
1696 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
1697 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm13[3,3,3,3,4,5,6,7]
1698 ; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4]
1699 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm10[4],xmm4[5,6,7]
1700 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm10[4],xmm13[4],xmm10[5],xmm13[5],xmm10[6],xmm13[6],xmm10[7],xmm13[7]
1701 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,2,3,3,4,5,6,7]
1702 ; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,2]
1703 ; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm4, %ymm4
1704 ; AVX1-NEXT: vmovaps {{.*#+}} ymm7 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535]
1705 ; AVX1-NEXT: vandnps %ymm3, %ymm7, %ymm3
1706 ; AVX1-NEXT: vandps %ymm7, %ymm4, %ymm4
1707 ; AVX1-NEXT: vorps %ymm3, %ymm4, %ymm3
1708 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[2,3,2,3]
1709 ; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm4[2],xmm5[3,4,5,6],xmm4[7]
1710 ; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0,1,2],xmm6[3],xmm3[4,5,6,7]
1711 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
1712 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3,4],xmm4[5],xmm3[6,7]
1713 ; AVX1-NEXT: vmovdqa %xmm3, 48(%r9)
1714 ; AVX1-NEXT: vmovdqa %xmm6, 32(%r9)
1715 ; AVX1-NEXT: vmovdqa %xmm0, 16(%r9)
1716 ; AVX1-NEXT: vmovdqa %xmm1, (%r9)
1717 ; AVX1-NEXT: vmovdqa %xmm2, 112(%r9)
1718 ; AVX1-NEXT: vmovdqa %xmm11, 96(%r9)
1719 ; AVX1-NEXT: vmovdqa %xmm9, 80(%r9)
1720 ; AVX1-NEXT: vmovdqa %xmm5, 64(%r9)
1721 ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1722 ; AVX1-NEXT: vmovaps %xmm0, 144(%r9)
1723 ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1724 ; AVX1-NEXT: vmovaps %xmm0, 128(%r9)
1725 ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1726 ; AVX1-NEXT: vmovaps %xmm0, 176(%r9)
1727 ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1728 ; AVX1-NEXT: vmovaps %xmm0, 160(%r9)
1729 ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1730 ; AVX1-NEXT: vmovaps %xmm0, 240(%r9)
1731 ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1732 ; AVX1-NEXT: vmovaps %xmm0, 224(%r9)
1733 ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1734 ; AVX1-NEXT: vmovaps %xmm0, 208(%r9)
1735 ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1736 ; AVX1-NEXT: vmovaps %xmm0, 192(%r9)
1737 ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1738 ; AVX1-NEXT: vmovaps %xmm0, 304(%r9)
1739 ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1740 ; AVX1-NEXT: vmovaps %xmm0, 288(%r9)
1741 ; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
1742 ; AVX1-NEXT: vmovaps %xmm0, 272(%r9)
1743 ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1744 ; AVX1-NEXT: vmovaps %xmm0, 256(%r9)
1745 ; AVX1-NEXT: addq $72, %rsp
1746 ; AVX1-NEXT: vzeroupper
1747 ; AVX1-NEXT: retq
1748 ;
1749 ; AVX2-SLOW-LABEL: vf32:
1750 ; AVX2-SLOW: # %bb.0:
1751 ; AVX2-SLOW-NEXT: subq $40, %rsp
1752 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm8
1753 ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm2
1754 ; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm3
1755 ; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm13
1756 ; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1757 ; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm10
1758 ; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm7
1759 ; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm11
1760 ; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm4
1761 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3]
1762 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7]
1763 ; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm1, %xmm1
1764 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
1765 ; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm6
1766 ; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm5
1767 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm12
1768 ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm0
1769 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3]
1770 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
1771 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,6]
1772 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
1773 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = <255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255>
1774 ; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm0, %ymm1, %ymm0
1775 ; AVX2-SLOW-NEXT: vpbroadcastq 32(%r8), %ymm1
1776 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255]
1777 ; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm0
1778 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill
1779 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
1780 ; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm1, %xmm1
1781 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
1782 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm6[0],xmm12[1],xmm6[1],xmm12[2],xmm6[2],xmm12[3],xmm6[3]
1783 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
1784 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,6]
1785 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
1786 ; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm0, %ymm1, %ymm0
1787 ; AVX2-SLOW-NEXT: vpbroadcastq (%r8), %ymm1
1788 ; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm0
1789 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1790 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13>
1791 ; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm6, %xmm0
1792 ; AVX2-SLOW-NEXT: vpbroadcastq 8(%rdi), %xmm1
1793 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
1794 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
1795 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9>
1796 ; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm11, %xmm6
1797 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[1,2,2,2]
1798 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0],xmm1[1],xmm6[2],xmm1[3],xmm6[4,5],xmm1[6],xmm6[7]
1799 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,0]
1800 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = <255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255>
1801 ; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm0, %ymm1, %ymm0
1802 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm13[0,1,1,1]
1803 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255]
1804 ; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm0
1805 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1806 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm2[2,3,2,3,6,7,6,7]
1807 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm3[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15]
1808 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,2,6,7,6,6]
1809 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
1810 ; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm10
1811 ; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm5, %xmm1
1812 ; AVX2-SLOW-NEXT: vpbroadcastq 40(%rdi), %xmm5
1813 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3],xmm5[4],xmm1[5],xmm5[6],xmm1[7]
1814 ; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm6
1815 ; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm4, %xmm4
1816 ; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm15
1817 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[1,2,2,2]
1818 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm4[0],xmm5[1],xmm4[2],xmm5[3],xmm4[4,5],xmm5[6],xmm4[7]
1819 ; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm4
1820 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2]
1821 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
1822 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,0]
1823 ; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm1, %ymm5, %ymm1
1824 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm6[3,2,3,3,7,6,7,7]
1825 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm15[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
1826 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,3,2,3,6,7,6,7]
1827 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2],ymm5[3,4],ymm7[5,6,7,8],ymm5[9],ymm7[10],ymm5[11,12],ymm7[13,14,15]
1828 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,2]
1829 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u>
1830 ; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm0, %ymm5, %ymm0
1831 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm4[0,1,1,1]
1832 ; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm1, %ymm5, %ymm1
1833 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1834 ; AVX2-SLOW-NEXT: vmovdqa %ymm8, %ymm1
1835 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm8[2,3,2,3,6,7,6,7]
1836 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm10[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15]
1837 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,3,2,2,6,7,6,6]
1838 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm9[0],ymm5[1],ymm9[2],ymm5[3],ymm9[4,5],ymm5[6],ymm9[7,8],ymm5[9],ymm9[10],ymm5[11],ymm9[12,13],ymm5[14],ymm9[15]
1839 ; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm12
1840 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm12[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
1841 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,3,2,3,6,7,6,7]
1842 ; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm14
1843 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm14[3,2,3,3,7,6,7,7]
1844 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0],ymm11[1],ymm9[2],ymm11[3,4],ymm9[5,6,7,8],ymm11[9],ymm9[10],ymm11[11,12],ymm9[13,14,15]
1845 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,2]
1846 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,2]
1847 ; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm5, %ymm9, %ymm7
1848 ; AVX2-SLOW-NEXT: vpbroadcastq 56(%r8), %ymm5
1849 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0]
1850 ; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm0, %ymm5, %ymm5
1851 ; AVX2-SLOW-NEXT: vpbroadcastq 24(%r8), %ymm0
1852 ; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm7, %ymm0, %ymm7
1853 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25>
1854 ; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm15, %ymm9
1855 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm6[1,1,1,2,5,5,5,6]
1856 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0],ymm11[1],ymm9[2,3],ymm11[4],ymm9[5],ymm11[6],ymm9[7,8],ymm11[9],ymm9[10,11],ymm11[12],ymm9[13],ymm11[14],ymm9[15]
1857 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm3[3,1,2,2,4,5,6,7,11,9,10,10,12,13,14,15]
1858 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
1859 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm2[0,1,2,1,4,5,6,5]
1860 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm13[2],ymm11[3],ymm13[4],ymm11[5,6],ymm13[7],ymm11[8,9],ymm13[10],ymm11[11],ymm13[12],ymm11[13,14],ymm13[15]
1861 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3]
1862 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3]
1863 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = <255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255>
1864 ; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm9, %ymm11, %ymm9
1865 ; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm12, %ymm0
1866 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm14[1,1,1,2,5,5,5,6]
1867 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm11[1],ymm0[2,3],ymm11[4],ymm0[5],ymm11[6],ymm0[7,8],ymm11[9],ymm0[10,11],ymm11[12],ymm0[13],ymm11[14],ymm0[15]
1868 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm10[3,1,2,2,4,5,6,7,11,9,10,10,12,13,14,15]
1869 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
1870 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,1,2,1,4,5,6,5]
1871 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm11[0,1],ymm8[2],ymm11[3],ymm8[4],ymm11[5,6],ymm8[7],ymm11[8,9],ymm8[10],ymm11[11],ymm8[12],ymm11[13,14],ymm8[15]
1872 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
1873 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3]
1874 ; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm0, %ymm8, %ymm0
1875 ; AVX2-SLOW-NEXT: vpbroadcastq 48(%r8), %ymm8
1876 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255]
1877 ; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm9, %ymm8, %ymm8
1878 ; AVX2-SLOW-NEXT: vpbroadcastq 16(%r8), %ymm9
1879 ; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm0, %ymm9, %ymm0
1880 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u>
1881 ; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm15, %ymm11
1882 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[3,0,3,0,7,4,7,4]
1883 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm11[1],ymm6[2],ymm11[3],ymm6[4,5],ymm11[6],ymm6[7,8],ymm11[9],ymm6[10],ymm11[11],ymm6[12,13],ymm11[14],ymm6[15]
1884 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = <u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u>
1885 ; AVX2-SLOW-NEXT: vpshufb %ymm11, %ymm3, %ymm3
1886 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,1,2,2]
1887 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15]
1888 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255>
1889 ; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm6, %ymm2, %ymm2
1890 ; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm12, %ymm6
1891 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm14[3,0,3,0,7,4,7,4]
1892 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm9[0],ymm6[1],ymm9[2],ymm6[3],ymm9[4,5],ymm6[6],ymm9[7,8],ymm6[9],ymm9[10],ymm6[11],ymm9[12,13],ymm6[14],ymm9[15]
1893 ; AVX2-SLOW-NEXT: vpshufb %ymm11, %ymm10, %ymm9
1894 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm1[1,1,2,2]
1895 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5],ymm9[6],ymm10[7,8],ymm9[9],ymm10[10,11],ymm9[12],ymm10[13],ymm9[14],ymm10[15]
1896 ; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm6, %ymm9, %ymm3
1897 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,1,2,2]
1898 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255]
1899 ; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm2, %ymm4, %ymm2
1900 ; AVX2-SLOW-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
1901 ; AVX2-SLOW-NEXT: # ymm4 = mem[1,1,2,2]
1902 ; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm3, %ymm4, %ymm3
1903 ; AVX2-SLOW-NEXT: vmovdqa %ymm3, 64(%r9)
1904 ; AVX2-SLOW-NEXT: vmovdqa %ymm2, 224(%r9)
1905 ; AVX2-SLOW-NEXT: vmovdqa %ymm0, 96(%r9)
1906 ; AVX2-SLOW-NEXT: vmovdqa %ymm7, 128(%r9)
1907 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1908 ; AVX2-SLOW-NEXT: vmovaps %ymm0, 192(%r9)
1909 ; AVX2-SLOW-NEXT: vmovdqa %ymm5, 288(%r9)
1910 ; AVX2-SLOW-NEXT: vmovdqa %ymm8, 256(%r9)
1911 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1912 ; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%r9)
1913 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1914 ; AVX2-SLOW-NEXT: vmovaps %ymm0, (%r9)
1915 ; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
1916 ; AVX2-SLOW-NEXT: vmovaps %ymm0, 160(%r9)
1917 ; AVX2-SLOW-NEXT: addq $40, %rsp
1918 ; AVX2-SLOW-NEXT: vzeroupper
1919 ; AVX2-SLOW-NEXT: retq
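; The AVX2-FAST lowering below mostly replaces the pshuflw/pshufhw/pshufd word-shuffle
; sequences of the AVX2-SLOW path with constant-mask vpshufb byte shuffles, which the
; fast-variable-shuffle tunings treat as cheap.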
1921 ; AVX2-FAST-LABEL: vf32:
1922 ; AVX2-FAST: # %bb.0:
1923 ; AVX2-FAST-NEXT: subq $40, %rsp
1924 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm14
1925 ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm12
1926 ; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm11
1927 ; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm8
1928 ; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1929 ; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm4
1930 ; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm13
1931 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13>
1932 ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm5
1933 ; AVX2-FAST-NEXT: vpbroadcastq 8(%rdi), %xmm6
1934 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2,3],xmm6[4],xmm5[5],xmm6[6],xmm5[7]
1935 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm5[0,1,0,1]
1936 ; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm5
1937 ; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm6
1938 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9>
1939 ; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm1
1940 ; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm7
1941 ; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm0
1942 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[1,2,2,2]
1943 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7]
1944 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,0]
1945 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255>
1946 ; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm9, %ymm1, %ymm1
1947 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm8[0,1,1,1]
1948 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255]
1949 ; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm1, %ymm9, %ymm1
1950 ; AVX2-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill
1951 ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm13, %xmm1
1952 ; AVX2-FAST-NEXT: vpbroadcastq 40(%rdi), %xmm3
1953 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5],xmm3[6],xmm1[7]
1954 ; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm6, %xmm3
1955 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,2,2,2]
1956 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4,5],xmm2[6],xmm3[7]
1957 ; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm3
1958 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
1959 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,0]
1960 ; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm1, %ymm2, %ymm1
1961 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm3[0,1,1,1]
1962 ; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm1, %ymm2, %ymm1
1963 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1964 ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm1
1965 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3]
1966 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13]
1967 ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1
1968 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
1969 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3]
1970 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7]
1971 ; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0
1972 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
1973 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255>
1974 ; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm1, %ymm0, %ymm0
1975 ; AVX2-FAST-NEXT: vpbroadcastq 32(%r8), %ymm1
1976 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255]
1977 ; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm0, %ymm1, %ymm0
1978 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1979 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm1
1980 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
1981 ; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm9
1982 ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1
1983 ; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm10
1984 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
1985 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3]
1986 ; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm2, %xmm2
1987 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1]
1988 ; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm1, %ymm2, %ymm1
1989 ; AVX2-FAST-NEXT: vpbroadcastq (%r8), %ymm2
1990 ; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm1, %ymm2, %ymm0
1991 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1992 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29>
1993 ; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm11, %ymm2
1994 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm12[2,3,2,3,6,7,6,7]
1995 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[3],ymm2[4,5],ymm4[6],ymm2[7,8],ymm4[9],ymm2[10],ymm4[11],ymm2[12,13],ymm4[14],ymm2[15]
1996 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,2]
1997 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31>
1998 ; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm10, %ymm4
1999 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm9[3,2,3,3,7,6,7,7]
2000 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2],ymm6[3,4],ymm4[5,6,7,8],ymm6[9],ymm4[10],ymm6[11,12],ymm4[13,14,15]
2001 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,2]
2002 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u>
2003 ; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm2, %ymm4, %ymm2
2004 ; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm4
2005 ; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm1
2006 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm14[2,3,2,3,6,7,6,7]
2007 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm7[1],ymm1[2],ymm7[3],ymm1[4,5],ymm7[6],ymm1[7,8],ymm7[9],ymm1[10],ymm7[11],ymm1[12,13],ymm7[14],ymm1[15]
2008 ; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm7
2009 ; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm7, %ymm5
2010 ; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm13
2011 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm13[3,2,3,3,7,6,7,7]
2012 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm15[1],ymm5[2],ymm15[3,4],ymm5[5,6,7,8],ymm15[9],ymm5[10],ymm15[11,12],ymm5[13,14,15]
2013 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,2]
2014 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,2]
2015 ; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm1, %ymm5, %ymm1
2016 ; AVX2-FAST-NEXT: vpbroadcastq 56(%r8), %ymm5
2017 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0]
2018 ; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm2, %ymm5, %ymm5
2019 ; AVX2-FAST-NEXT: vpbroadcastq 24(%r8), %ymm2
2020 ; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm1, %ymm2, %ymm1
2021 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25>
2022 ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm10, %ymm6
2023 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm9[1,1,1,2,5,5,5,6]
2024 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm15[1],ymm6[2,3],ymm15[4],ymm6[5],ymm15[6],ymm6[7,8],ymm15[9],ymm6[10,11],ymm15[12],ymm6[13],ymm15[14],ymm6[15]
2025 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,18,19,u,u,20,21,u,u,24,25,24,25,u,u>
2026 ; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm11, %ymm0
2027 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm12[0,1,2,1,4,5,6,5]
2028 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm8[2],ymm0[3],ymm8[4],ymm0[5,6],ymm8[7],ymm0[8,9],ymm8[10],ymm0[11],ymm8[12],ymm0[13,14],ymm8[15]
2029 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3]
2030 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
2031 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255>
2032 ; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm6, %ymm0, %ymm0
2033 ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm7, %ymm2
2034 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm13[1,1,1,2,5,5,5,6]
2035 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm6[1],ymm2[2,3],ymm6[4],ymm2[5],ymm6[6],ymm2[7,8],ymm6[9],ymm2[10,11],ymm6[12],ymm2[13],ymm6[14],ymm2[15]
2036 ; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm4, %ymm6
2037 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm14[0,1,2,1,4,5,6,5]
2038 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm15[2],ymm6[3],ymm15[4],ymm6[5,6],ymm15[7],ymm6[8,9],ymm15[10],ymm6[11],ymm15[12],ymm6[13,14],ymm15[15]
2039 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3]
2040 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3]
2041 ; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm2, %ymm6, %ymm2
2042 ; AVX2-FAST-NEXT: vpbroadcastq 48(%r8), %ymm6
2043 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255]
2044 ; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm0, %ymm6, %ymm0
2045 ; AVX2-FAST-NEXT: vpbroadcastq 16(%r8), %ymm6
2046 ; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm2, %ymm6, %ymm2
2047 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u>
2048 ; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm10, %ymm8
2049 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[3,0,3,0,7,4,7,4]
2050 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10],ymm8[11],ymm9[12,13],ymm8[14],ymm9[15]
2051 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u>
2052 ; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm11, %ymm10
2053 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm12[1,1,2,2]
2054 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10,11],ymm10[12],ymm11[13],ymm10[14],ymm11[15]
2055 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255>
2056 ; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm8, %ymm10, %ymm8
2057 ; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm7, %ymm6
2058 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm13[3,0,3,0,7,4,7,4]
2059 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10],ymm6[11],ymm7[12,13],ymm6[14],ymm7[15]
2060 ; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm4, %ymm4
2061 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm14[1,1,2,2]
2062 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0],ymm4[1],ymm7[2,3],ymm4[4],ymm7[5],ymm4[6],ymm7[7,8],ymm4[9],ymm7[10,11],ymm4[12],ymm7[13],ymm4[14],ymm7[15]
2063 ; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm6, %ymm4, %ymm4
2064 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,1,2,2]
2065 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255]
2066 ; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm8, %ymm3, %ymm3
2067 ; AVX2-FAST-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
2068 ; AVX2-FAST-NEXT: # ymm7 = mem[1,1,2,2]
2069 ; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm4, %ymm7, %ymm4
2070 ; AVX2-FAST-NEXT: vmovdqa %ymm4, 64(%r9)
2071 ; AVX2-FAST-NEXT: vmovdqa %ymm3, 224(%r9)
2072 ; AVX2-FAST-NEXT: vmovdqa %ymm2, 96(%r9)
2073 ; AVX2-FAST-NEXT: vmovdqa %ymm1, 128(%r9)
2074 ; AVX2-FAST-NEXT: vmovdqa %ymm5, 288(%r9)
2075 ; AVX2-FAST-NEXT: vmovdqa %ymm0, 256(%r9)
2076 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2077 ; AVX2-FAST-NEXT: vmovaps %ymm0, (%r9)
2078 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2079 ; AVX2-FAST-NEXT: vmovaps %ymm0, 160(%r9)
2080 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2081 ; AVX2-FAST-NEXT: vmovaps %ymm0, 192(%r9)
2082 ; AVX2-FAST-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
2083 ; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%r9)
2084 ; AVX2-FAST-NEXT: addq $40, %rsp
2085 ; AVX2-FAST-NEXT: vzeroupper
2086 ; AVX2-FAST-NEXT: retq
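; The AVX512 lowering below works on whole 512-bit inputs: each 64-byte result vector is
; built with vpermi2w cross-lane word permutes, the permuted halves are merged with
; vmovdqu16 under constant k-masks (e.g. 0x18C6318C), and the %r8 data is folded in by a
; final vpermi2w before the store.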
2088 ; AVX512-LABEL: vf32:
2089 ; AVX512: # %bb.0:
2090 ; AVX512-NEXT: vmovdqu64 (%rdi), %zmm0
2091 ; AVX512-NEXT: vmovdqu64 (%rsi), %zmm1
2092 ; AVX512-NEXT: vmovdqu64 (%rdx), %zmm2
2093 ; AVX512-NEXT: vmovdqu64 (%rcx), %zmm3
2094 ; AVX512-NEXT: vmovdqu64 (%r8), %zmm4
2095 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = <u,u,0,32,u,u,u,1,33,u,u,u,2,34,u,u,u,3,35,u,u,u,4,36,u,u,u,5,37,u,u,u>
2096 ; AVX512-NEXT: vpermi2w %zmm3, %zmm2, %zmm5
2097 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = <0,32,u,u,u,1,33,u,u,u,2,34,u,u,u,3,35,u,u,u,4,36,u,u,u,5,37,u,u,u,6,38>
2098 ; AVX512-NEXT: vpermi2w %zmm1, %zmm0, %zmm6
2099 ; AVX512-NEXT: movl $415641996, %eax # imm = 0x18C6318C
2100 ; AVX512-NEXT: kmovd %eax, %k1
2101 ; AVX512-NEXT: vmovdqu16 %zmm5, %zmm6 {%k1}
2102 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,32,5,6,7,8,33,10,11,12,13,34,15,16,17,18,35,20,21,22,23,36,25,26,27,28,37,30,31]
2103 ; AVX512-NEXT: vpermi2w %zmm4, %zmm6, %zmm5
2104 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = <u,u,u,7,39,u,u,u,8,40,u,u,u,9,41,u,u,u,10,42,u,u,u,11,43,u,u,u,12,44,u,u>
2105 ; AVX512-NEXT: vpermi2w %zmm1, %zmm0, %zmm6
2106 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = <6,38,u,u,u,7,39,u,u,u,8,40,u,u,u,9,41,u,u,u,10,42,u,u,u,11,43,u,u,u,12,44>
2107 ; AVX512-NEXT: vpermi2w %zmm3, %zmm2, %zmm7
2108 ; AVX512-NEXT: movl $831283992, %eax # imm = 0x318C6318
2109 ; AVX512-NEXT: kmovd %eax, %k2
2110 ; AVX512-NEXT: vmovdqu16 %zmm6, %zmm7 {%k2}
2111 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,38,3,4,5,6,39,8,9,10,11,40,13,14,15,16,41,18,19,20,21,42,23,24,25,26,43,28,29,30,31]
2112 ; AVX512-NEXT: vpermi2w %zmm4, %zmm7, %zmm6
2113 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = <u,u,u,13,45,u,u,u,14,46,u,u,u,15,47,u,u,u,16,48,u,u,u,17,49,u,u,u,18,50,u,u>
2114 ; AVX512-NEXT: vpermi2w %zmm3, %zmm2, %zmm7
2115 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = <u,13,45,u,u,u,14,46,u,u,u,15,47,u,u,u,16,48,u,u,u,17,49,u,u,u,18,50,u,u,u,19>
2116 ; AVX512-NEXT: vpermi2w %zmm1, %zmm0, %zmm8
2117 ; AVX512-NEXT: vmovdqu16 %zmm7, %zmm8 {%k2}
2118 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [44,1,2,3,4,45,6,7,8,9,46,11,12,13,14,47,16,17,18,19,48,21,22,23,24,49,26,27,28,29,50,31]
2119 ; AVX512-NEXT: vpermi2w %zmm4, %zmm8, %zmm7
2120 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = <u,19,51,u,u,u,20,52,u,u,u,21,53,u,u,u,22,54,u,u,u,23,55,u,u,u,24,56,u,u,u,25>
2121 ; AVX512-NEXT: vpermi2w %zmm3, %zmm2, %zmm8
2122 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = <19,u,u,u,52,20,u,u,u,53,21,u,u,u,54,22,u,u,u,55,23,u,u,u,56,24,u,u,u,57,25,u>
2123 ; AVX512-NEXT: vpermi2w %zmm0, %zmm1, %zmm9
2124 ; AVX512-NEXT: movl $-1939662650, %eax # imm = 0x8C6318C6
2125 ; AVX512-NEXT: kmovd %eax, %k2
2126 ; AVX512-NEXT: vmovdqu16 %zmm8, %zmm9 {%k2}
2127 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,51,4,5,6,7,52,9,10,11,12,53,14,15,16,17,54,19,20,21,22,55,24,25,26,27,56,29,30,31]
2128 ; AVX512-NEXT: vpermi2w %zmm4, %zmm9, %zmm8
2129 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = <u,u,26,58,u,u,u,27,59,u,u,u,28,60,u,u,u,29,61,u,u,u,30,62,u,u,u,31,63,u,u,u>
2130 ; AVX512-NEXT: vpermi2w %zmm1, %zmm0, %zmm9
2131 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = <25,u,u,u,58,26,u,u,u,59,27,u,u,u,60,28,u,u,u,61,29,u,u,u,62,30,u,u,u,63,31,u>
2132 ; AVX512-NEXT: vpermi2w %zmm2, %zmm3, %zmm0
2133 ; AVX512-NEXT: vmovdqu16 %zmm9, %zmm0 {%k1}
2134 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,57,2,3,4,5,58,7,8,9,10,59,12,13,14,15,60,17,18,19,20,61,22,23,24,25,62,27,28,29,30,63]
2135 ; AVX512-NEXT: vpermi2w %zmm4, %zmm0, %zmm1
2136 ; AVX512-NEXT: vmovdqu64 %zmm1, 256(%r9)
2137 ; AVX512-NEXT: vmovdqu64 %zmm8, 192(%r9)
2138 ; AVX512-NEXT: vmovdqu64 %zmm7, 128(%r9)
2139 ; AVX512-NEXT: vmovdqu64 %zmm6, 64(%r9)
2140 ; AVX512-NEXT: vmovdqu64 %zmm5, (%r9)
2141 ; AVX512-NEXT: vzeroupper
2142 ; AVX512-NEXT: retq
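; The IR below spells out the stride-5 interleave: the five <32 x i16> inputs are
; concatenated through a chain of shufflevectors into a <160 x i16> value, a final
; shufflevector interleaves the elements as 0,32,64,96,128,1,33,..., and the result is
; written back with a single wide store.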
2143 %in.vec0 = load <32 x i16>, ptr %in.vecptr0, align 32
2144 %in.vec1 = load <32 x i16>, ptr %in.vecptr1, align 32
2145 %in.vec2 = load <32 x i16>, ptr %in.vecptr2, align 32
2146 %in.vec3 = load <32 x i16>, ptr %in.vecptr3, align 32
2147 %in.vec4 = load <32 x i16>, ptr %in.vecptr4, align 32
2149 %concat01 = shufflevector <32 x i16> %in.vec0, <32 x i16> %in.vec1, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
2150 %concat23 = shufflevector <32 x i16> %in.vec2, <32 x i16> %in.vec3, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
2151 %concat0123 = shufflevector <64 x i16> %concat01, <64 x i16> %concat23, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
2152 %concat4uuu = shufflevector <32 x i16> %in.vec4, <32 x i16> poison, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2153 %concat01234 = shufflevector <128 x i16> %concat0123, <128 x i16> %concat4uuu, <160 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127, i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 135, i32 136, i32 137, i32 138, i32 139, i32 140, i32 141, i32 142, i32 143, i32 144, i32 145, i32 146, i32 147, i32 148, i32 149, i32 150, i32 151, i32 152, i32 153, i32 154, i32 155, i32 156, i32 157, i32 158, i32 159>
2154 %interleaved.vec = shufflevector <160 x i16> %concat01234, <160 x i16> poison, <160 x i32> <i32 0, i32 32, i32 64, i32 96, i32 128, i32 1, i32 33, i32 65, i32 97, i32 129, i32 2, i32 34, i32 66, i32 98, i32 130, i32 3, i32 35, i32 67, i32 99, i32 131, i32 4, i32 36, i32 68, i32 100, i32 132, i32 5, i32 37, i32 69, i32 101, i32 133, i32 6, i32 38, i32 70, i32 102, i32 134, i32 7, i32 39, i32 71, i32 103, i32 135, i32 8, i32 40, i32 72, i32 104, i32 136, i32 9, i32 41, i32 73, i32 105, i32 137, i32 10, i32 42, i32 74, i32 106, i32 138, i32 11, i32 43, i32 75, i32 107, i32 139, i32 12, i32 44, i32 76, i32 108, i32 140, i32 13, i32 45, i32 77, i32 109, i32 141, i32 14, i32 46, i32 78, i32 110, i32 142, i32 15, i32 47, i32 79, i32 111, i32 143, i32 16, i32 48, i32 80, i32 112, i32 144, i32 17, i32 49, i32 81, i32 113, i32 145, i32 18, i32 50, i32 82, i32 114, i32 146, i32 19, i32 51, i32 83, i32 115, i32 147, i32 20, i32 52, i32 84, i32 116, i32 148, i32 21, i32 53, i32 85, i32 117, i32 149, i32 22, i32 54, i32 86, i32 118, i32 150, i32 23, i32 55, i32 87, i32 119, i32 151, i32 24, i32 56, i32 88, i32 120, i32 152, i32 25, i32 57, i32 89, i32 121, i32 153, i32 26, i32 58, i32 90, i32 122, i32 154, i32 27, i32 59, i32 91, i32 123, i32 155, i32 28, i32 60, i32 92, i32 124, i32 156, i32 29, i32 61, i32 93, i32 125, i32 157, i32 30, i32 62, i32 94, i32 126, i32 158, i32 31, i32 63, i32 95, i32 127, i32 159>
2156 store <160 x i16> %interleaved.vec, ptr %out.vec, align 32