1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck --check-prefixes=AVX2-SLOW %s
3 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck --check-prefixes=AVX2-FAST,AVX2-FAST-ALL %s
4 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck --check-prefixes=AVX2-FAST,AVX2-FAST-PERLANE %s
6 ; These patterns are produced by LoopVectorizer for interleaved stores.
; vf2: stride-4 interleaved store of i16 data, two elements per source vector.
; Four <2 x i16> vectors are loaded, concatenated into one <8 x i16>, shuffled
; so that element j of source k lands at output index 4*j + k, and stored to
; %out.vec. The assembly expectations below are autogenerated (see the NOTE on
; the first line of the file); regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
8 define void @vf2(<2 x i16>* %in.vecptr0, <2 x i16>* %in.vecptr1, <2 x i16>* %in.vecptr2, <2 x i16>* %in.vecptr3, <8 x i16>* %out.vec) nounwind {
9 ; AVX2-SLOW-LABEL: vf2:
11 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0
12 ; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm1
13 ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
14 ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
15 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
16 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15]
17 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, (%r8)
18 ; AVX2-SLOW-NEXT: retq
20 ; AVX2-FAST-LABEL: vf2:
22 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0
23 ; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm1
24 ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
25 ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
26 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
27 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15]
28 ; AVX2-FAST-NEXT: vmovdqa %xmm0, (%r8)
29 ; AVX2-FAST-NEXT: retq
; Load the four 2-element source vectors; every load is marked align 32.
30 %in.vec0 = load <2 x i16>, <2 x i16>* %in.vecptr0, align 32
31 %in.vec1 = load <2 x i16>, <2 x i16>* %in.vecptr1, align 32
32 %in.vec2 = load <2 x i16>, <2 x i16>* %in.vecptr2, align 32
33 %in.vec3 = load <2 x i16>, <2 x i16>* %in.vecptr3, align 32
; Concatenate the sources end to end (identity masks): vec0|vec1, vec2|vec3,
; then both halves into a single <8 x i16>.
35 %concat01 = shufflevector <2 x i16> %in.vec0, <2 x i16> %in.vec1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
36 %concat23 = shufflevector <2 x i16> %in.vec2, <2 x i16> %in.vec3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
37 %concat0123 = shufflevector <4 x i16> %concat01, <4 x i16> %concat23, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; Interleave: the mask picks element 0 of each source (indices 0,2,4,6) and
; then element 1 of each source (indices 1,3,5,7).
38 %interleaved.vec = shufflevector <8 x i16> %concat0123, <8 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
; Write the interleaved result to the output pointer.
40 store <8 x i16> %interleaved.vec, <8 x i16>* %out.vec, align 32
; NOTE(review): no `ret void` terminator or closing `}` is visible after this
; store in the reviewed text, and neither check block above has a `# %bb.0`
; line after its label line. TODO confirm against the complete file; if check
; lines are really missing, regenerate them rather than adding them by hand.
; vf4: stride-4 interleaved store of i16 data, four elements per source vector.
; Four <4 x i16> vectors are loaded, concatenated into one <16 x i16>, shuffled
; so that element j of source k lands at output index 4*j + k, and stored to
; %out.vec. This is the only function in the visible part of the file whose
; expectations are split between the two fast-shuffle configurations: one
; block shows a vpermd-based sequence, the other matches the slow-shuffle
; sequence. The expectations are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
45 define void @vf4(<4 x i16>* %in.vecptr0, <4 x i16>* %in.vecptr1, <4 x i16>* %in.vecptr2, <4 x i16>* %in.vecptr3, <16 x i16>* %out.vec) nounwind {
46 ; AVX2-SLOW-LABEL: vf4:
48 ; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
49 ; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
50 ; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
51 ; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
52 ; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
53 ; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
54 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
55 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u,20,21,28,29,u,u,u,u,22,23,30,31]
56 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
57 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,0,1,8,9,u,u,u,u,2,3,10,11,20,21,28,29,u,u,u,u,22,23,30,31,u,u,u,u]
58 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6],ymm1[7]
59 ; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%r8)
60 ; AVX2-SLOW-NEXT: vzeroupper
61 ; AVX2-SLOW-NEXT: retq
63 ; AVX2-FAST-ALL-LABEL: vf4:
64 ; AVX2-FAST-ALL: # %bb.0:
65 ; AVX2-FAST-ALL-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
66 ; AVX2-FAST-ALL-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
67 ; AVX2-FAST-ALL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
68 ; AVX2-FAST-ALL-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
69 ; AVX2-FAST-ALL-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
70 ; AVX2-FAST-ALL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
71 ; AVX2-FAST-ALL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
72 ; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,1,3,5,7]
73 ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0
74 ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,2,3,6,7,10,11,14,15,16,17,20,21,24,25,28,29,18,19,22,23,26,27,30,31]
75 ; AVX2-FAST-ALL-NEXT: vmovdqa %ymm0, (%r8)
76 ; AVX2-FAST-ALL-NEXT: vzeroupper
77 ; AVX2-FAST-ALL-NEXT: retq
79 ; AVX2-FAST-PERLANE-LABEL: vf4:
80 ; AVX2-FAST-PERLANE: # %bb.0:
81 ; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
82 ; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
83 ; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
84 ; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
85 ; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
86 ; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
87 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
88 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u,20,21,28,29,u,u,u,u,22,23,30,31]
89 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
90 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,0,1,8,9,u,u,u,u,2,3,10,11,20,21,28,29,u,u,u,u,22,23,30,31,u,u,u,u]
91 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6],ymm1[7]
92 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%r8)
93 ; AVX2-FAST-PERLANE-NEXT: vzeroupper
94 ; AVX2-FAST-PERLANE-NEXT: retq
; Load the four 4-element source vectors; every load is marked align 32.
95 %in.vec0 = load <4 x i16>, <4 x i16>* %in.vecptr0, align 32
96 %in.vec1 = load <4 x i16>, <4 x i16>* %in.vecptr1, align 32
97 %in.vec2 = load <4 x i16>, <4 x i16>* %in.vecptr2, align 32
98 %in.vec3 = load <4 x i16>, <4 x i16>* %in.vecptr3, align 32
; Concatenate the sources end to end (identity masks): vec0|vec1, vec2|vec3,
; then both halves into a single <16 x i16>.
100 %concat01 = shufflevector <4 x i16> %in.vec0, <4 x i16> %in.vec1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
101 %concat23 = shufflevector <4 x i16> %in.vec2, <4 x i16> %in.vec3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
102 %concat0123 = shufflevector <8 x i16> %concat01, <8 x i16> %concat23, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; Interleave: each group of four mask entries selects the same element index
; from sources 0..3 (indices j, j+4, j+8, j+12 for j = 0..3).
103 %interleaved.vec = shufflevector <16 x i16> %concat0123, <16 x i16> poison, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
; Write the interleaved result to the output pointer.
105 store <16 x i16> %interleaved.vec, <16 x i16>* %out.vec, align 32
; NOTE(review): no `ret void` terminator or closing `}` is visible after this
; store in the reviewed text, and the first check block above has no
; `# %bb.0` line after its label line while the other two do. TODO confirm
; against the complete file; regenerate the checks if lines are missing.
; vf8: stride-4 interleaved store of i16 data, eight elements per source
; vector. Four <8 x i16> vectors are loaded, concatenated into one <32 x i16>,
; shuffled so that element j of source k lands at output index 4*j + k, and
; stored to %out.vec. The slow-shuffle and fast-shuffle expectations below
; list the same instruction sequence. They are autogenerated; regenerate them
; with utils/update_llc_test_checks.py instead of editing them by hand.
110 define void @vf8(<8 x i16>* %in.vecptr0, <8 x i16>* %in.vecptr1, <8 x i16>* %in.vecptr2, <8 x i16>* %in.vecptr3, <32 x i16>* %out.vec) nounwind {
111 ; AVX2-SLOW-LABEL: vf8:
112 ; AVX2-SLOW: # %bb.0:
113 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0
114 ; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm1
115 ; AVX2-SLOW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
116 ; AVX2-SLOW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
117 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm1[0,2,0,2]
118 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <u,u,u,u,0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,4,5,12,13,u,u,u,u,6,7,14,15>
119 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
120 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,2,0,2]
121 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,4,5,12,13,u,u,u,u,6,7,14,15,u,u,u,u>
122 ; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm4, %ymm4
123 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4],ymm2[5],ymm4[6],ymm2[7]
124 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,1,3]
125 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm1, %ymm1
126 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,1,3]
127 ; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm0, %ymm0
128 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
129 ; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%r8)
130 ; AVX2-SLOW-NEXT: vmovdqa %ymm2, (%r8)
131 ; AVX2-SLOW-NEXT: vzeroupper
132 ; AVX2-SLOW-NEXT: retq
134 ; AVX2-FAST-LABEL: vf8:
135 ; AVX2-FAST: # %bb.0:
136 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0
137 ; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm1
138 ; AVX2-FAST-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
139 ; AVX2-FAST-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
140 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm1[0,2,0,2]
141 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <u,u,u,u,0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,4,5,12,13,u,u,u,u,6,7,14,15>
142 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2
143 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,2,0,2]
144 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,4,5,12,13,u,u,u,u,6,7,14,15,u,u,u,u>
145 ; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm4
146 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4],ymm2[5],ymm4[6],ymm2[7]
147 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,1,3]
148 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm1
149 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,1,3]
150 ; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm0
151 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
152 ; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%r8)
153 ; AVX2-FAST-NEXT: vmovdqa %ymm2, (%r8)
154 ; AVX2-FAST-NEXT: vzeroupper
155 ; AVX2-FAST-NEXT: retq
; Load the four 8-element source vectors; every load is marked align 32.
156 %in.vec0 = load <8 x i16>, <8 x i16>* %in.vecptr0, align 32
157 %in.vec1 = load <8 x i16>, <8 x i16>* %in.vecptr1, align 32
158 %in.vec2 = load <8 x i16>, <8 x i16>* %in.vecptr2, align 32
159 %in.vec3 = load <8 x i16>, <8 x i16>* %in.vecptr3, align 32
; Concatenate the sources end to end (identity masks): vec0|vec1, vec2|vec3,
; then both halves into a single <32 x i16>.
161 %concat01 = shufflevector <8 x i16> %in.vec0, <8 x i16> %in.vec1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
162 %concat23 = shufflevector <8 x i16> %in.vec2, <8 x i16> %in.vec3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
163 %concat0123 = shufflevector <16 x i16> %concat01, <16 x i16> %concat23, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
; Interleave: each group of four mask entries selects the same element index
; from sources 0..3 (indices j, j+8, j+16, j+24 for j = 0..7).
164 %interleaved.vec = shufflevector <32 x i16> %concat0123, <32 x i16> poison, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
; Write the interleaved result to the output pointer.
166 store <32 x i16> %interleaved.vec, <32 x i16>* %out.vec, align 32
; NOTE(review): no `ret void` terminator or closing `}` is visible after this
; store in the reviewed text. TODO confirm against the complete file.
; vf16: stride-4 interleaved store of i16 data, sixteen elements per source
; vector. Four <16 x i16> vectors are loaded, concatenated into one
; <64 x i16>, shuffled so that element j of source k lands at output index
; 4*j + k, and stored to %out.vec. The slow-shuffle and fast-shuffle
; expectations below list the same instruction sequence. They are
; autogenerated; regenerate them with utils/update_llc_test_checks.py instead
; of editing them by hand.
171 define void @vf16(<16 x i16>* %in.vecptr0, <16 x i16>* %in.vecptr1, <16 x i16>* %in.vecptr2, <16 x i16>* %in.vecptr3, <64 x i16>* %out.vec) nounwind {
172 ; AVX2-SLOW-LABEL: vf16:
173 ; AVX2-SLOW: # %bb.0:
174 ; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm5
175 ; AVX2-SLOW-NEXT: vmovdqa 16(%rcx), %xmm8
176 ; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm6
177 ; AVX2-SLOW-NEXT: vmovdqa 16(%rdx), %xmm9
178 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
179 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,0,1,1]
180 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
181 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
182 ; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm7
183 ; AVX2-SLOW-NEXT: vmovdqa 16(%rsi), %xmm3
184 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0
185 ; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm4
186 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
187 ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm1[0],zero,xmm1[1],zero
188 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
189 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm10, %ymm1
190 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
191 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
192 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[0,0,1,1]
193 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
194 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm10, %ymm1
195 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7]
196 ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm2[0],zero,xmm2[1],zero
197 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
198 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm10, %ymm2
199 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7]
200 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
201 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,0,1,1]
202 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
203 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2
204 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3]
205 ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero
206 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
207 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm5, %ymm0
208 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
209 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
210 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,0,1,1]
211 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
212 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2
213 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
214 ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero
215 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
216 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3
217 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7]
218 ; AVX2-SLOW-NEXT: vmovdqa %ymm2, 96(%r8)
219 ; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%r8)
220 ; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%r8)
221 ; AVX2-SLOW-NEXT: vmovdqa %ymm11, 64(%r8)
222 ; AVX2-SLOW-NEXT: vzeroupper
223 ; AVX2-SLOW-NEXT: retq
225 ; AVX2-FAST-LABEL: vf16:
226 ; AVX2-FAST: # %bb.0:
227 ; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm5
228 ; AVX2-FAST-NEXT: vmovdqa 16(%rcx), %xmm8
229 ; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm6
230 ; AVX2-FAST-NEXT: vmovdqa 16(%rdx), %xmm9
231 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
232 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,0,1,1]
233 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
234 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
235 ; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm7
236 ; AVX2-FAST-NEXT: vmovdqa 16(%rsi), %xmm3
237 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0
238 ; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm4
239 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
240 ; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm1[0],zero,xmm1[1],zero
241 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
242 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm10, %ymm1
243 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
244 ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
245 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[0,0,1,1]
246 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
247 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm10, %ymm1
248 ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7]
249 ; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm2[0],zero,xmm2[1],zero
250 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
251 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm10, %ymm2
252 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7]
253 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
254 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,0,1,1]
255 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
256 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2
257 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3]
258 ; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero
259 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
260 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm5, %ymm0
261 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
262 ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
263 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,0,1,1]
264 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
265 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2
266 ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
267 ; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero
268 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
269 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3
270 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7]
271 ; AVX2-FAST-NEXT: vmovdqa %ymm2, 96(%r8)
272 ; AVX2-FAST-NEXT: vmovdqa %ymm0, (%r8)
273 ; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%r8)
274 ; AVX2-FAST-NEXT: vmovdqa %ymm11, 64(%r8)
275 ; AVX2-FAST-NEXT: vzeroupper
276 ; AVX2-FAST-NEXT: retq
; Load the four 16-element source vectors; every load is marked align 32.
277 %in.vec0 = load <16 x i16>, <16 x i16>* %in.vecptr0, align 32
278 %in.vec1 = load <16 x i16>, <16 x i16>* %in.vecptr1, align 32
279 %in.vec2 = load <16 x i16>, <16 x i16>* %in.vecptr2, align 32
280 %in.vec3 = load <16 x i16>, <16 x i16>* %in.vecptr3, align 32
; Concatenate the sources end to end (identity masks): vec0|vec1, vec2|vec3,
; then both halves into a single <64 x i16>.
282 %concat01 = shufflevector <16 x i16> %in.vec0, <16 x i16> %in.vec1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
283 %concat23 = shufflevector <16 x i16> %in.vec2, <16 x i16> %in.vec3, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
284 %concat0123 = shufflevector <32 x i16> %concat01, <32 x i16> %concat23, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
; Interleave: each group of four mask entries selects the same element index
; from sources 0..3 (indices j, j+16, j+32, j+48 for j = 0..15).
285 %interleaved.vec = shufflevector <64 x i16> %concat0123, <64 x i16> poison, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63>
; Write the interleaved result to the output pointer.
287 store <64 x i16> %interleaved.vec, <64 x i16>* %out.vec, align 32
; NOTE(review): no `ret void` terminator or closing `}` is visible after this
; store in the reviewed text. TODO confirm against the complete file.
292 define void @vf32(<32 x i16>* %in.vecptr0, <32 x i16>* %in.vecptr1, <32 x i16>* %in.vecptr2, <32 x i16>* %in.vecptr3, <128 x i16>* %out.vec) nounwind {
293 ; AVX2-SLOW-LABEL: vf32:
294 ; AVX2-SLOW: # %bb.0:
295 ; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm15
296 ; AVX2-SLOW-NEXT: vmovdqa 16(%rcx), %xmm12
297 ; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm11
298 ; AVX2-SLOW-NEXT: vmovdqa 48(%rcx), %xmm2
299 ; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm6
300 ; AVX2-SLOW-NEXT: vmovdqa 16(%rdx), %xmm13
301 ; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm1
302 ; AVX2-SLOW-NEXT: vmovdqa 48(%rdx), %xmm7
303 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
304 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[0,0,1,1]
305 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
306 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm5, %ymm8
307 ; AVX2-SLOW-NEXT: vmovdqa 16(%rsi), %xmm14
308 ; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm3
309 ; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm5
310 ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm4
311 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm14[0],xmm5[1],xmm14[1],xmm5[2],xmm14[2],xmm5[3],xmm14[3]
312 ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm0[0],zero,xmm0[1],zero
313 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
314 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm9, %ymm0
315 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2],ymm8[3],ymm0[4],ymm8[5],ymm0[6],ymm8[7]
316 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
317 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7]
318 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,0,1,1]
319 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
320 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm8, %ymm8
321 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
322 ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm0[0],zero,xmm0[1],zero
323 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
324 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm10, %ymm0
325 ; AVX2-SLOW-NEXT: vmovdqa 48(%rsi), %xmm10
326 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0],ymm8[1],ymm0[2],ymm8[3],ymm0[4],ymm8[5],ymm0[6],ymm8[7]
327 ; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm0
328 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3]
329 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[0,0,1,1]
330 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
331 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm8, %ymm1
332 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
333 ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero
334 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
335 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3
336 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0],ymm1[1],ymm3[2],ymm1[3],ymm3[4],ymm1[5],ymm3[6],ymm1[7]
337 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7]
338 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[0,0,1,1]
339 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
340 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1
341 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
342 ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero
343 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
344 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3
345 ; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm4
346 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0],ymm1[1],ymm3[2],ymm1[3],ymm3[4],ymm1[5],ymm3[6],ymm1[7]
347 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm1
348 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3]
349 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,0,1,1]
350 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
351 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
352 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3]
353 ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
354 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
355 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0
356 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
357 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm15[4],xmm6[5],xmm15[5],xmm6[6],xmm15[6],xmm6[7],xmm15[7]
358 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,0,1,1]
359 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
360 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0
361 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
362 ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm3[0],zero,xmm3[1],zero
363 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
364 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm7, %ymm3
365 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2],ymm0[3],ymm3[4],ymm0[5],ymm3[6],ymm0[7]
366 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm15[0],xmm6[1],xmm15[1],xmm6[2],xmm15[2],xmm6[3],xmm15[3]
367 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[0,0,1,1]
368 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
369 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm6, %ymm3
370 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
371 ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero
372 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
373 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm4, %ymm1
374 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2],ymm3[3],ymm1[4],ymm3[5],ymm1[6],ymm3[7]
375 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7]
376 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,0,1,1]
377 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
378 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3
379 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm14[4],xmm5[5],xmm14[5],xmm5[6],xmm14[6],xmm5[7],xmm14[7]
380 ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
381 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3]
382 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4
383 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7]
384 ; AVX2-SLOW-NEXT: vmovdqa %ymm3, 96(%r8)
385 ; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%r8)
386 ; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%r8)
387 ; AVX2-SLOW-NEXT: vmovdqa %ymm2, 192(%r8)
388 ; AVX2-SLOW-NEXT: vmovdqa %ymm11, 224(%r8)
389 ; AVX2-SLOW-NEXT: vmovdqa %ymm8, 128(%r8)
390 ; AVX2-SLOW-NEXT: vmovdqa %ymm9, 160(%r8)
391 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
392 ; AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%r8)
393 ; AVX2-SLOW-NEXT: vzeroupper
394 ; AVX2-SLOW-NEXT: retq
396 ; AVX2-FAST-LABEL: vf32:
397 ; AVX2-FAST: # %bb.0:
398 ; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm15
399 ; AVX2-FAST-NEXT: vmovdqa 16(%rcx), %xmm12
400 ; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm11
401 ; AVX2-FAST-NEXT: vmovdqa 48(%rcx), %xmm2
402 ; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm6
403 ; AVX2-FAST-NEXT: vmovdqa 16(%rdx), %xmm13
404 ; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm1
405 ; AVX2-FAST-NEXT: vmovdqa 48(%rdx), %xmm7
406 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
407 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[0,0,1,1]
408 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
409 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm5, %ymm8
410 ; AVX2-FAST-NEXT: vmovdqa 16(%rsi), %xmm14
411 ; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm3
412 ; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm5
413 ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm4
414 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm14[0],xmm5[1],xmm14[1],xmm5[2],xmm14[2],xmm5[3],xmm14[3]
415 ; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm0[0],zero,xmm0[1],zero
416 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
417 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm9, %ymm0
418 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2],ymm8[3],ymm0[4],ymm8[5],ymm0[6],ymm8[7]
419 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
420 ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7]
421 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,0,1,1]
422 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
423 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm8, %ymm8
424 ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
425 ; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm0[0],zero,xmm0[1],zero
426 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
427 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm10, %ymm0
428 ; AVX2-FAST-NEXT: vmovdqa 48(%rsi), %xmm10
429 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0],ymm8[1],ymm0[2],ymm8[3],ymm0[4],ymm8[5],ymm0[6],ymm8[7]
430 ; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm0
431 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3]
432 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[0,0,1,1]
433 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
434 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm8, %ymm1
435 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
436 ; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero
437 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
438 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3
439 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0],ymm1[1],ymm3[2],ymm1[3],ymm3[4],ymm1[5],ymm3[6],ymm1[7]
440 ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7]
441 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[0,0,1,1]
442 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
443 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1
444 ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
445 ; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero
446 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
447 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3
448 ; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm4
449 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0],ymm1[1],ymm3[2],ymm1[3],ymm3[4],ymm1[5],ymm3[6],ymm1[7]
450 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm1
451 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3]
452 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,0,1,1]
453 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
454 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
455 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3]
456 ; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
457 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
458 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0
459 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
460 ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm15[4],xmm6[5],xmm15[5],xmm6[6],xmm15[6],xmm6[7],xmm15[7]
461 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,0,1,1]
462 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
463 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0
464 ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
465 ; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm3[0],zero,xmm3[1],zero
466 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
467 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm7, %ymm3
468 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2],ymm0[3],ymm3[4],ymm0[5],ymm3[6],ymm0[7]
469 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm15[0],xmm6[1],xmm15[1],xmm6[2],xmm15[2],xmm6[3],xmm15[3]
470 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[0,0,1,1]
471 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
472 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm6, %ymm3
473 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
474 ; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero
475 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
476 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm4, %ymm1
477 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2],ymm3[3],ymm1[4],ymm3[5],ymm1[6],ymm3[7]
478 ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7]
479 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,0,1,1]
480 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
481 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3
482 ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm14[4],xmm5[5],xmm14[5],xmm5[6],xmm14[6],xmm5[7],xmm14[7]
483 ; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
484 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3]
485 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4
486 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7]
487 ; AVX2-FAST-NEXT: vmovdqa %ymm3, 96(%r8)
488 ; AVX2-FAST-NEXT: vmovdqa %ymm1, (%r8)
489 ; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%r8)
490 ; AVX2-FAST-NEXT: vmovdqa %ymm2, 192(%r8)
491 ; AVX2-FAST-NEXT: vmovdqa %ymm11, 224(%r8)
492 ; AVX2-FAST-NEXT: vmovdqa %ymm8, 128(%r8)
493 ; AVX2-FAST-NEXT: vmovdqa %ymm9, 160(%r8)
494 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
495 ; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%r8)
496 ; AVX2-FAST-NEXT: vzeroupper
497 ; AVX2-FAST-NEXT: retq
498 %in.vec0 = load <32 x i16>, <32 x i16>* %in.vecptr0, align 32
499 %in.vec1 = load <32 x i16>, <32 x i16>* %in.vecptr1, align 32
500 %in.vec2 = load <32 x i16>, <32 x i16>* %in.vecptr2, align 32
501 %in.vec3 = load <32 x i16>, <32 x i16>* %in.vecptr3, align 32
503 %concat01 = shufflevector <32 x i16> %in.vec0, <32 x i16> %in.vec1, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
504 %concat23 = shufflevector <32 x i16> %in.vec2, <32 x i16> %in.vec3, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
505 %concat0123 = shufflevector <64 x i16> %concat01, <64 x i16> %concat23, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
506 %interleaved.vec = shufflevector <128 x i16> %concat0123, <128 x i16> poison, <128 x i32> <i32 0, i32 32, i32 64, i32 96, i32 1, i32 33, i32 65, i32 97, i32 2, i32 34, i32 66, i32 98, i32 3, i32 35, i32 67, i32 99, i32 4, i32 36, i32 68, i32 100, i32 5, i32 37, i32 69, i32 101, i32 6, i32 38, i32 70, i32 102, i32 7, i32 39, i32 71, i32 103, i32 8, i32 40, i32 72, i32 104, i32 9, i32 41, i32 73, i32 105, i32 10, i32 42, i32 74, i32 106, i32 11, i32 43, i32 75, i32 107, i32 12, i32 44, i32 76, i32 108, i32 13, i32 45, i32 77, i32 109, i32 14, i32 46, i32 78, i32 110, i32 15, i32 47, i32 79, i32 111, i32 16, i32 48, i32 80, i32 112, i32 17, i32 49, i32 81, i32 113, i32 18, i32 50, i32 82, i32 114, i32 19, i32 51, i32 83, i32 115, i32 20, i32 52, i32 84, i32 116, i32 21, i32 53, i32 85, i32 117, i32 22, i32 54, i32 86, i32 118, i32 23, i32 55, i32 87, i32 119, i32 24, i32 56, i32 88, i32 120, i32 25, i32 57, i32 89, i32 121, i32 26, i32 58, i32 90, i32 122, i32 27, i32 59, i32 91, i32 123, i32 28, i32 60, i32 92, i32 124, i32 29, i32 61, i32 93, i32 125, i32 30, i32 62, i32 94, i32 126, i32 31, i32 63, i32 95, i32 127>
508 store <128 x i16> %interleaved.vec, <128 x i16>* %out.vec, align 32