; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX1
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2,AVX2-FAST-ALL
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2,AVX2-FAST-PERLANE
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512

; These patterns are produced by LoopVectorizer for interleaved loads.
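;
; A minimal illustrative sketch (not part of the generated checks; the value
; names %wide, %v0, %v1 and %v2 are made up for this comment): a stride-3
; deinterleave of a <6 x i32> {a0,b0,c0,a1,b1,c1} picks every third element
; with three shufflevector masks, which is the shape of each test body below:
;   %v0 = shufflevector <6 x i32> %wide, <6 x i32> poison, <2 x i32> <i32 0, i32 3>   ; {a0,a1}
;   %v1 = shufflevector <6 x i32> %wide, <6 x i32> poison, <2 x i32> <i32 1, i32 4>   ; {b0,b1}
;   %v2 = shufflevector <6 x i32> %wide, <6 x i32> poison, <2 x i32> <i32 2, i32 5>   ; {c0,c1}
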
define void @load_i32_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) nounwind {
; SSE-LABEL: load_i32_stride3_vf2:
; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm0
; SSE-NEXT: movdqa 16(%rdi), %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,3,2,3]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: movq %xmm2, (%rsi)
; SSE-NEXT: movq %xmm3, (%rdx)
; SSE-NEXT: movq %xmm0, (%rcx)
; SSE-NEXT: retq
;
; AVX1-LABEL: load_i32_stride3_vf2:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovaps (%rdi), %xmm0
; AVX1-NEXT: vmovaps 16(%rdi), %xmm1
; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[0,3,2,3]
; AVX1-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX1-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[1,0,2,3]
; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX1-NEXT: vmovlps %xmm2, (%rsi)
; AVX1-NEXT: vmovlps %xmm3, (%rdx)
; AVX1-NEXT: vmovlps %xmm0, (%rcx)
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_i32_stride3_vf2:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovaps (%rdi), %xmm0
; AVX2-NEXT: vmovaps 16(%rdi), %xmm1
; AVX2-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[0,3,2,3]
; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,2,3]
; AVX2-NEXT: vbroadcastss 8(%rdi), %xmm3
; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3]
; AVX2-NEXT: vmovlps %xmm2, (%rsi)
; AVX2-NEXT: vmovlps %xmm0, (%rdx)
; AVX2-NEXT: vmovlps %xmm1, (%rcx)
; AVX2-NEXT: retq
;
; AVX512-LABEL: load_i32_stride3_vf2:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovaps (%rdi), %xmm0
; AVX512-NEXT: vmovaps 16(%rdi), %xmm1
; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[0,3,2,3]
; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,2,3]
; AVX512-NEXT: vbroadcastss 8(%rdi), %xmm3
; AVX512-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3]
; AVX512-NEXT: vmovlps %xmm2, (%rsi)
; AVX512-NEXT: vmovlps %xmm0, (%rdx)
; AVX512-NEXT: vmovlps %xmm1, (%rcx)
; AVX512-NEXT: retq
  %wide.vec = load <6 x i32>, ptr %in.vec, align 32

  %strided.vec0 = shufflevector <6 x i32> %wide.vec, <6 x i32> poison, <2 x i32> <i32 0, i32 3>
  %strided.vec1 = shufflevector <6 x i32> %wide.vec, <6 x i32> poison, <2 x i32> <i32 1, i32 4>
  %strided.vec2 = shufflevector <6 x i32> %wide.vec, <6 x i32> poison, <2 x i32> <i32 2, i32 5>

  store <2 x i32> %strided.vec0, ptr %out.vec0, align 32
  store <2 x i32> %strided.vec1, ptr %out.vec1, align 32
  store <2 x i32> %strided.vec2, ptr %out.vec2, align 32

  ret void
}

define void @load_i32_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) nounwind {
; SSE-LABEL: load_i32_stride3_vf4:
; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm0
; SSE-NEXT: movaps 16(%rdi), %xmm1
; SSE-NEXT: movaps 32(%rdi), %xmm2
; SSE-NEXT: movdqa %xmm0, %xmm3
; SSE-NEXT: movaps %xmm1, %xmm4
; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,3,2,3]
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,1,1]
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[1,0]
; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[0,2]
; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm2[2,3]
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2]
; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm2[0,3]
; SSE-NEXT: movaps %xmm3, (%rsi)
; SSE-NEXT: movaps %xmm0, (%rdx)
; SSE-NEXT: movaps %xmm5, (%rcx)
; SSE-NEXT: retq
;
; AVX1-LABEL: load_i32_stride3_vf4:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovaps (%rdi), %xmm0
; AVX1-NEXT: vmovaps 16(%rdi), %xmm1
; AVX1-NEXT: vmovaps 32(%rdi), %xmm2
; AVX1-NEXT: vblendps {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3]
; AVX1-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3]
; AVX1-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[0,3,2,1]
; AVX1-NEXT: vmovaps 32(%rdi), %xmm4
; AVX1-NEXT: vblendps {{.*#+}} xmm4 = mem[0,1],xmm4[2,3]
; AVX1-NEXT: vblendps {{.*#+}} xmm4 = xmm1[0],xmm4[1,2],xmm1[3]
; AVX1-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[1,0,3,2]
; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3]
; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,0,3]
; AVX1-NEXT: vmovaps %xmm3, (%rsi)
; AVX1-NEXT: vmovaps %xmm4, (%rdx)
; AVX1-NEXT: vmovaps %xmm0, (%rcx)
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_i32_stride3_vf4:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovaps (%rdi), %ymm0
; AVX2-NEXT: vmovaps 32(%rdi), %ymm1
; AVX2-NEXT: vmovaps {{.*#+}} xmm2 = [0,3,6,1]
; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7]
; AVX2-NEXT: vpermps %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vmovaps {{.*#+}} xmm3 = [1,4,7,2]
; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vpermps %ymm4, %ymm3, %ymm3
; AVX2-NEXT: vmovaps {{.*#+}} xmm4 = [2,5,0,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
; AVX2-NEXT: vpermps %ymm0, %ymm4, %ymm0
; AVX2-NEXT: vmovaps %xmm2, (%rsi)
; AVX2-NEXT: vmovaps %xmm3, (%rdx)
; AVX2-NEXT: vmovaps %xmm0, (%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: load_i32_stride3_vf4:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %ymm0
; AVX512-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [0,3,6,9]
; AVX512-NEXT: vpermi2d %ymm1, %ymm0, %ymm2
; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [1,4,7,10]
; AVX512-NEXT: vpermi2d %ymm1, %ymm0, %ymm3
; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [2,5,8,11]
; AVX512-NEXT: vpermi2d %ymm1, %ymm0, %ymm4
; AVX512-NEXT: vmovdqa %xmm2, (%rsi)
; AVX512-NEXT: vmovdqa %xmm3, (%rdx)
; AVX512-NEXT: vmovdqa %xmm4, (%rcx)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %wide.vec = load <12 x i32>, ptr %in.vec, align 32

  %strided.vec0 = shufflevector <12 x i32> %wide.vec, <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
  %strided.vec1 = shufflevector <12 x i32> %wide.vec, <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
  %strided.vec2 = shufflevector <12 x i32> %wide.vec, <12 x i32> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>

  store <4 x i32> %strided.vec0, ptr %out.vec0, align 32
  store <4 x i32> %strided.vec1, ptr %out.vec1, align 32
  store <4 x i32> %strided.vec2, ptr %out.vec2, align 32

  ret void
}

define void @load_i32_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) nounwind {
; SSE-LABEL: load_i32_stride3_vf8:
; SSE: # %bb.0:
; SSE-NEXT: movaps 80(%rdi), %xmm8
; SSE-NEXT: movaps 64(%rdi), %xmm3
; SSE-NEXT: movdqa (%rdi), %xmm1
; SSE-NEXT: movaps 16(%rdi), %xmm6
; SSE-NEXT: movaps 32(%rdi), %xmm10
; SSE-NEXT: movdqa 48(%rdi), %xmm2
; SSE-NEXT: movdqa %xmm1, %xmm11
; SSE-NEXT: movaps %xmm6, %xmm7
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm6[0,0]
; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[1,1,1,1]
; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm10[1,0]
; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,3],xmm6[0,2]
; SSE-NEXT: movdqa %xmm2, %xmm6
; SSE-NEXT: movaps %xmm3, %xmm4
; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[2,3,2,3]
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm3[0,0]
; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm3[1,1,1,1]
; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm8[1,0]
; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,3],xmm3[0,2]
; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm8[2,3]
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,1],xmm10[2,3]
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm7[0,2]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1]
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm10[0,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1]
; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm8[0,3]
; SSE-NEXT: movaps %xmm6, 16(%rsi)
; SSE-NEXT: movaps %xmm11, (%rsi)
; SSE-NEXT: movaps %xmm2, 16(%rdx)
; SSE-NEXT: movaps %xmm1, (%rdx)
; SSE-NEXT: movaps %xmm5, 16(%rcx)
; SSE-NEXT: movaps %xmm0, (%rcx)
; SSE-NEXT: retq
;
; AVX1-LABEL: load_i32_stride3_vf8:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovaps 64(%rdi), %ymm0
; AVX1-NEXT: vmovaps 32(%rdi), %ymm1
; AVX1-NEXT: vmovaps (%rdi), %ymm2
; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7]
; AVX1-NEXT: vmovaps 16(%rdi), %xmm4
; AVX1-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,1],ymm1[1,3],ymm4[6,5],ymm1[5,7]
; AVX1-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,3],ymm4[0,2],ymm3[4,7],ymm4[4,6]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3,0,1]
; AVX1-NEXT: vshufps {{.*#+}} ymm5 = ymm0[1,0],ymm4[2,0],ymm0[5,4],ymm4[6,4]
; AVX1-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4]
; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7]
; AVX1-NEXT: vshufps {{.*#+}} ymm5 = ymm0[2,0],ymm4[3,0],ymm0[6,4],ymm4[7,4]
; AVX1-NEXT: vshufps {{.*#+}} ymm5 = ymm4[0,0],ymm5[2,0],ymm4[4,4],ymm5[6,4]
; AVX1-NEXT: vmovaps 16(%rdi), %xmm6
; AVX1-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
; AVX1-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,2],ymm6[0,3],ymm7[5,6],ymm6[4,7]
; AVX1-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[0,2,3,1,4,6,7,5]
; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5,6,7]
; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm6[1,0],ymm2[2,0],ymm6[5,4],ymm2[6,4]
; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm1[0,3],ymm2[6,4],ymm1[4,7]
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm4[0,1],ymm0[0,3],ymm4[4,5],ymm0[4,7]
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
; AVX1-NEXT: vmovaps %ymm3, (%rsi)
; AVX1-NEXT: vmovaps %ymm5, (%rdx)
; AVX1-NEXT: vmovaps %ymm0, (%rcx)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: load_i32_stride3_vf8:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm0
; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm1
; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %ymm2
; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm3 = [21474836482,21474836482,21474836482,21474836482]
; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm3, %ymm3
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm5 = <0,3,6,1,4,7,u,u>
; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm5, %ymm4
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [0,0,3,6,0,0,3,6]
; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,0,1]
; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm4, %ymm4
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm6 = <1,4,7,2,5,u,u,u>
; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm6, %ymm5
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm1 = <2,5,0,3,6,u,u,u>
; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[0,1,0,3,4,5,4,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX2-SLOW-NEXT: vmovaps %ymm3, (%rsi)
; AVX2-SLOW-NEXT: vmovaps %ymm4, (%rdx)
; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rcx)
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-ALL-LABEL: load_i32_stride3_vf8:
; AVX2-FAST-ALL: # %bb.0:
; AVX2-FAST-ALL-NEXT: vmovaps (%rdi), %ymm0
; AVX2-FAST-ALL-NEXT: vmovaps 32(%rdi), %ymm1
; AVX2-FAST-ALL-NEXT: vmovaps 64(%rdi), %ymm2
; AVX2-FAST-ALL-NEXT: vbroadcastsd {{.*#+}} ymm3 = [21474836482,21474836482,21474836482,21474836482]
; AVX2-FAST-ALL-NEXT: vpermps %ymm2, %ymm3, %ymm3
; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm5 = <0,3,6,1,4,7,u,u>
; AVX2-FAST-ALL-NEXT: vpermps %ymm4, %ymm5, %ymm4
; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [0,0,3,6,0,0,3,6]
; AVX2-FAST-ALL-NEXT: # ymm4 = mem[0,1,0,1]
; AVX2-FAST-ALL-NEXT: vpermps %ymm2, %ymm4, %ymm4
; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm6 = <1,4,7,2,5,u,u,u>
; AVX2-FAST-ALL-NEXT: vpermps %ymm5, %ymm6, %ymm5
; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7]
; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm5 = <u,u,u,u,u,1,4,7>
; AVX2-FAST-ALL-NEXT: vpermps %ymm2, %ymm5, %ymm2
; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = <2,5,0,3,6,u,u,u>
; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
; AVX2-FAST-ALL-NEXT: vmovaps %ymm3, (%rsi)
; AVX2-FAST-ALL-NEXT: vmovaps %ymm4, (%rdx)
; AVX2-FAST-ALL-NEXT: vmovaps %ymm0, (%rcx)
; AVX2-FAST-ALL-NEXT: vzeroupper
; AVX2-FAST-ALL-NEXT: retq
;
; AVX2-FAST-PERLANE-LABEL: load_i32_stride3_vf8:
; AVX2-FAST-PERLANE: # %bb.0:
; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm0
; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm1
; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %ymm2
; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm3 = [21474836482,21474836482,21474836482,21474836482]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm3, %ymm3
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm5 = <0,3,6,1,4,7,u,u>
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm5, %ymm4
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [0,0,3,6,0,0,3,6]
; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,0,1]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm4, %ymm4
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm6 = <1,4,7,2,5,u,u,u>
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm6, %ymm5
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm1 = <2,5,0,3,6,u,u,u>
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[0,1,0,3,4,5,4,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, (%rsi)
; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, (%rdx)
; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rcx)
; AVX2-FAST-PERLANE-NEXT: vzeroupper
; AVX2-FAST-PERLANE-NEXT: retq
;
; AVX512-LABEL: load_i32_stride3_vf8:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqu64 (%rdi), %zmm0
; AVX512-NEXT: vmovdqa 64(%rdi), %ymm1
; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [0,3,6,9,12,15,18,21]
; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [1,4,7,10,13,16,19,22]
; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [2,5,8,11,14,17,20,23]
; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm4
; AVX512-NEXT: vmovdqa %ymm2, (%rsi)
; AVX512-NEXT: vmovdqa %ymm3, (%rdx)
; AVX512-NEXT: vmovdqa %ymm4, (%rcx)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %wide.vec = load <24 x i32>, ptr %in.vec, align 32

  %strided.vec0 = shufflevector <24 x i32> %wide.vec, <24 x i32> poison, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
  %strided.vec1 = shufflevector <24 x i32> %wide.vec, <24 x i32> poison, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
  %strided.vec2 = shufflevector <24 x i32> %wide.vec, <24 x i32> poison, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>

  store <8 x i32> %strided.vec0, ptr %out.vec0, align 32
  store <8 x i32> %strided.vec1, ptr %out.vec1, align 32
  store <8 x i32> %strided.vec2, ptr %out.vec2, align 32

  ret void
}

define void @load_i32_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) nounwind {
; SSE-LABEL: load_i32_stride3_vf16:
; SSE: # %bb.0:
; SSE-NEXT: movaps 96(%rdi), %xmm14
; SSE-NEXT: movaps 128(%rdi), %xmm11
; SSE-NEXT: movaps 112(%rdi), %xmm12
; SSE-NEXT: movaps 144(%rdi), %xmm3
; SSE-NEXT: movaps 176(%rdi), %xmm13
; SSE-NEXT: movaps 160(%rdi), %xmm5
; SSE-NEXT: movaps (%rdi), %xmm15
; SSE-NEXT: movaps 16(%rdi), %xmm8
; SSE-NEXT: movaps 32(%rdi), %xmm6
; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 48(%rdi), %xmm10
; SSE-NEXT: movaps 80(%rdi), %xmm9
; SSE-NEXT: movaps 64(%rdi), %xmm2
; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm9[1,0]
; SSE-NEXT: movaps %xmm10, %xmm4
; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm0[0,2]
; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm5, %xmm0
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm13[1,0]
; SSE-NEXT: movaps %xmm3, %xmm1
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm8, %xmm0
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm6[1,0]
; SSE-NEXT: movaps %xmm15, %xmm1
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm12, %xmm0
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm11[1,0]
; SSE-NEXT: movaps %xmm14, %xmm1
; SSE-NEXT: movaps %xmm14, %xmm7
; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm10, %xmm6
; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,0],xmm2[0,0]
; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm9[2,3]
; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm0[0,2]
; SSE-NEXT: movaps %xmm3, %xmm4
; SSE-NEXT: movaps %xmm3, %xmm14
; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm5[0,0]
; SSE-NEXT: movaps %xmm5, %xmm0
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm13[2,3]
; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm0[0,2]
; SSE-NEXT: movaps %xmm7, %xmm1
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm12[0,0]
; SSE-NEXT: movaps %xmm12, %xmm0
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm11[2,3]
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,3,2,3]
; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,0],xmm8[0,0]
; SSE-NEXT: movaps %xmm8, %xmm3
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm7[2,3]
; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,2],xmm3[0,2]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,1,1]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[2,3,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0,3]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,1,1]
; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm14[2,3,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm13[0,3]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,1,1]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm7[0,3]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm12[1,1,1,1]
; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
; SSE-NEXT: # xmm7 = mem[2,3,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1]
; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm11[0,3]
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; SSE-NEXT: movaps %xmm3, 32(%rsi)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; SSE-NEXT: movaps %xmm3, (%rsi)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; SSE-NEXT: movaps %xmm3, 48(%rsi)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; SSE-NEXT: movaps %xmm3, 16(%rsi)
; SSE-NEXT: movaps %xmm1, 32(%rdx)
; SSE-NEXT: movaps %xmm15, (%rdx)
; SSE-NEXT: movaps %xmm4, 48(%rdx)
; SSE-NEXT: movaps %xmm6, 16(%rdx)
; SSE-NEXT: movaps %xmm7, 32(%rcx)
; SSE-NEXT: movaps %xmm0, (%rcx)
; SSE-NEXT: movaps %xmm5, 48(%rcx)
; SSE-NEXT: movaps %xmm2, 16(%rcx)
; SSE-NEXT: retq
;
; AVX1-LABEL: load_i32_stride3_vf16:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovaps 64(%rdi), %ymm1
; AVX1-NEXT: vmovaps 32(%rdi), %ymm3
; AVX1-NEXT: vmovaps (%rdi), %ymm5
; AVX1-NEXT: vmovaps 160(%rdi), %ymm0
; AVX1-NEXT: vmovaps 128(%rdi), %ymm2
; AVX1-NEXT: vmovaps 96(%rdi), %ymm4
; AVX1-NEXT: vblendps {{.*#+}} ymm6 = ymm4[0],ymm2[1],ymm4[2,3],ymm2[4],ymm4[5,6],ymm2[7]
; AVX1-NEXT: vmovaps 112(%rdi), %xmm7
; AVX1-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,1],ymm2[1,3],ymm7[6,5],ymm2[5,7]
; AVX1-NEXT: vshufps {{.*#+}} ymm7 = ymm6[0,3],ymm7[0,2],ymm6[4,7],ymm7[4,6]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm0[2,3,0,1]
; AVX1-NEXT: vshufps {{.*#+}} ymm8 = ymm0[1,0],ymm6[2,0],ymm0[5,4],ymm6[6,4]
; AVX1-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4]
; AVX1-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7]
; AVX1-NEXT: vblendps {{.*#+}} ymm8 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5,6],ymm3[7]
; AVX1-NEXT: vmovaps 16(%rdi), %xmm9
; AVX1-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,1],ymm3[1,3],ymm9[6,5],ymm3[5,7]
; AVX1-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,3],ymm9[0,2],ymm8[4,7],ymm9[4,6]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm1[2,3,0,1]
; AVX1-NEXT: vshufps {{.*#+}} ymm10 = ymm1[1,0],ymm9[2,0],ymm1[5,4],ymm9[6,4]
; AVX1-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4]
; AVX1-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm10[6,7]
; AVX1-NEXT: vshufps {{.*#+}} ymm10 = ymm1[2,0],ymm9[3,0],ymm1[6,4],ymm9[7,4]
; AVX1-NEXT: vshufps {{.*#+}} ymm10 = ymm9[0,0],ymm10[2,0],ymm9[4,4],ymm10[6,4]
; AVX1-NEXT: vmovaps 16(%rdi), %xmm11
; AVX1-NEXT: vblendps {{.*#+}} ymm12 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7]
; AVX1-NEXT: vshufps {{.*#+}} ymm12 = ymm12[1,2],ymm11[0,3],ymm12[5,6],ymm11[4,7]
; AVX1-NEXT: vpermilps {{.*#+}} ymm12 = ymm12[0,2,3,1,4,6,7,5]
; AVX1-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1,2,3,4],ymm10[5,6,7]
; AVX1-NEXT: vshufps {{.*#+}} ymm12 = ymm0[2,0],ymm6[3,0],ymm0[6,4],ymm6[7,4]
; AVX1-NEXT: vshufps {{.*#+}} ymm12 = ymm6[0,0],ymm12[2,0],ymm6[4,4],ymm12[6,4]
; AVX1-NEXT: vmovaps 112(%rdi), %xmm13
; AVX1-NEXT: vblendps {{.*#+}} ymm14 = ymm4[0,1],ymm2[2],ymm4[3,4],ymm2[5],ymm4[6,7]
; AVX1-NEXT: vshufps {{.*#+}} ymm14 = ymm14[1,2],ymm13[0,3],ymm14[5,6],ymm13[4,7]
; AVX1-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[0,2,3,1,4,6,7,5]
; AVX1-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm12[5,6,7]
; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7]
; AVX1-NEXT: vshufps {{.*#+}} ymm5 = ymm11[1,0],ymm5[2,0],ymm11[5,4],ymm5[6,4]
; AVX1-NEXT: vshufps {{.*#+}} ymm3 = ymm5[2,0],ymm3[0,3],ymm5[6,4],ymm3[4,7]
; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,1],ymm1[0,3],ymm9[4,5],ymm1[4,7]
; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5,6,7]
; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7]
; AVX1-NEXT: vshufps {{.*#+}} ymm3 = ymm13[1,0],ymm3[2,0],ymm13[5,4],ymm3[6,4]
; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm3[2,0],ymm2[0,3],ymm3[6,4],ymm2[4,7]
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm6[0,1],ymm0[0,3],ymm6[4,5],ymm0[4,7]
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7]
; AVX1-NEXT: vmovaps %ymm8, (%rsi)
; AVX1-NEXT: vmovaps %ymm7, 32(%rsi)
; AVX1-NEXT: vmovaps %ymm12, 32(%rdx)
; AVX1-NEXT: vmovaps %ymm10, (%rdx)
; AVX1-NEXT: vmovaps %ymm0, 32(%rcx)
; AVX1-NEXT: vmovaps %ymm1, (%rcx)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: load_i32_stride3_vf16:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vmovaps 128(%rdi), %ymm1
; AVX2-SLOW-NEXT: vmovaps 160(%rdi), %ymm0
; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm2
; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm3
; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %ymm4
; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %ymm5
; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm6 = [21474836482,21474836482,21474836482,21474836482]
; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm6, %ymm7
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7]
; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm9 = <0,3,6,1,4,7,u,u>
; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm9, %ymm8
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7]
; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm6, %ymm6
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm5[0],ymm1[1],ymm5[2,3],ymm1[4],ymm5[5,6],ymm1[7]
; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm9, %ymm8
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7]
; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm8 = [0,0,3,6,0,0,3,6]
; AVX2-SLOW-NEXT: # ymm8 = mem[0,1,0,1]
; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm8, %ymm9
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm11 = <1,4,7,2,5,u,u,u>
; AVX2-SLOW-NEXT: vpermps %ymm10, %ymm11, %ymm10
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5,6,7]
; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm8, %ymm8
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm5[0,1],ymm1[2],ymm5[3,4],ymm1[5],ymm5[6,7]
; AVX2-SLOW-NEXT: vpermps %ymm10, %ymm11, %ymm10
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3,4],ymm8[5,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm3 = <2,5,0,3,6,u,u,u>
; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm3, %ymm2
; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[0,1,0,3,4,5,4,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm5[2],ymm1[3,4],ymm5[5],ymm1[6,7]
; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm3, %ymm1
; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,0,3,4,5,4,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
; AVX2-SLOW-NEXT: vmovaps %ymm6, 32(%rsi)
; AVX2-SLOW-NEXT: vmovaps %ymm7, (%rsi)
; AVX2-SLOW-NEXT: vmovaps %ymm8, 32(%rdx)
; AVX2-SLOW-NEXT: vmovaps %ymm9, (%rdx)
; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%rcx)
; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rcx)
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-ALL-LABEL: load_i32_stride3_vf16:
; AVX2-FAST-ALL: # %bb.0:
; AVX2-FAST-ALL-NEXT: vmovaps 128(%rdi), %ymm0
; AVX2-FAST-ALL-NEXT: vmovaps 160(%rdi), %ymm1
; AVX2-FAST-ALL-NEXT: vmovaps (%rdi), %ymm2
; AVX2-FAST-ALL-NEXT: vmovaps 32(%rdi), %ymm3
; AVX2-FAST-ALL-NEXT: vmovaps 64(%rdi), %ymm4
; AVX2-FAST-ALL-NEXT: vmovaps 96(%rdi), %ymm5
; AVX2-FAST-ALL-NEXT: vbroadcastsd {{.*#+}} ymm6 = [21474836482,21474836482,21474836482,21474836482]
; AVX2-FAST-ALL-NEXT: vpermps %ymm4, %ymm6, %ymm7
; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm8 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7]
; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm9 = <0,3,6,1,4,7,u,u>
; AVX2-FAST-ALL-NEXT: vpermps %ymm8, %ymm9, %ymm8
; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7]
; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm6, %ymm6
; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm8 = ymm5[0],ymm0[1],ymm5[2,3],ymm0[4],ymm5[5,6],ymm0[7]
; AVX2-FAST-ALL-NEXT: vpermps %ymm8, %ymm9, %ymm8
; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7]
; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm8 = [0,0,3,6,0,0,3,6]
; AVX2-FAST-ALL-NEXT: # ymm8 = mem[0,1,0,1]
; AVX2-FAST-ALL-NEXT: vpermps %ymm4, %ymm8, %ymm9
; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm10 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm11 = <1,4,7,2,5,u,u,u>
; AVX2-FAST-ALL-NEXT: vpermps %ymm10, %ymm11, %ymm10
; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5,6,7]
; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm8, %ymm8
; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm10 = ymm5[0,1],ymm0[2],ymm5[3,4],ymm0[5],ymm5[6,7]
; AVX2-FAST-ALL-NEXT: vpermps %ymm10, %ymm11, %ymm10
; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3,4],ymm8[5,6,7]
; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm10 = [0,1,0,3,0,1,4,7]
; AVX2-FAST-ALL-NEXT: vpermps %ymm4, %ymm10, %ymm4
; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm3 = <2,5,0,3,6,u,u,u>
; AVX2-FAST-ALL-NEXT: vpermps %ymm2, %ymm3, %ymm2
; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7]
; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm10, %ymm1
; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2],ymm0[3,4],ymm5[5],ymm0[6,7]
; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm3, %ymm0
; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX2-FAST-ALL-NEXT: vmovaps %ymm6, 32(%rsi)
; AVX2-FAST-ALL-NEXT: vmovaps %ymm7, (%rsi)
; AVX2-FAST-ALL-NEXT: vmovaps %ymm8, 32(%rdx)
; AVX2-FAST-ALL-NEXT: vmovaps %ymm9, (%rdx)
; AVX2-FAST-ALL-NEXT: vmovaps %ymm0, 32(%rcx)
; AVX2-FAST-ALL-NEXT: vmovaps %ymm2, (%rcx)
; AVX2-FAST-ALL-NEXT: vzeroupper
; AVX2-FAST-ALL-NEXT: retq
;
; AVX2-FAST-PERLANE-LABEL: load_i32_stride3_vf16:
; AVX2-FAST-PERLANE: # %bb.0:
; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rdi), %ymm1
; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rdi), %ymm0
; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm2
; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm3
; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %ymm4
; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %ymm5
; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm6 = [21474836482,21474836482,21474836482,21474836482]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm6, %ymm7
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7]
; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm9 = <0,3,6,1,4,7,u,u>
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm9, %ymm8
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm6, %ymm6
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm5[0],ymm1[1],ymm5[2,3],ymm1[4],ymm5[5,6],ymm1[7]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm9, %ymm8
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7]
; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm8 = [0,0,3,6,0,0,3,6]
; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1,0,1]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm8, %ymm9
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm11 = <1,4,7,2,5,u,u,u>
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm10, %ymm11, %ymm10
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm8, %ymm8
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm5[0,1],ymm1[2],ymm5[3,4],ymm1[5],ymm5[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm10, %ymm11, %ymm10
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3,4],ymm8[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm3 = <2,5,0,3,6,u,u,u>
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm3, %ymm2
; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[0,1,0,3,4,5,4,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm5[2],ymm1[3,4],ymm5[5],ymm1[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm3, %ymm1
; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,0,3,4,5,4,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 32(%rsi)
; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, (%rsi)
; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 32(%rdx)
; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, (%rdx)
; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%rcx)
; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rcx)
; AVX2-FAST-PERLANE-NEXT: vzeroupper
; AVX2-FAST-PERLANE-NEXT: retq
;
; AVX512-LABEL: load_i32_stride3_vf16:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqu64 (%rdi), %zmm0
; AVX512-NEXT: vmovdqu64 64(%rdi), %zmm1
; AVX512-NEXT: vmovdqu64 128(%rdi), %zmm2
; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = <0,3,6,9,12,15,18,21,24,27,30,u,u,u,u,u>
; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29]
; AVX512-NEXT: vpermi2d %zmm2, %zmm3, %zmm4
; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = <17,20,23,26,29,0,3,6,9,12,15,u,u,u,u,u>
; AVX512-NEXT: vpermi2d %zmm0, %zmm1, %zmm3
; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30]
; AVX512-NEXT: vpermi2d %zmm2, %zmm3, %zmm5
; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = <2,5,8,11,14,17,20,23,26,29,u,u,u,u,u,u>
; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31]
; AVX512-NEXT: vpermi2d %zmm2, %zmm3, %zmm0
; AVX512-NEXT: vmovdqu64 %zmm4, (%rsi)
; AVX512-NEXT: vmovdqu64 %zmm5, (%rdx)
; AVX512-NEXT: vmovdqu64 %zmm0, (%rcx)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %wide.vec = load <48 x i32>, ptr %in.vec, align 32

  %strided.vec0 = shufflevector <48 x i32> %wide.vec, <48 x i32> poison, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
  %strided.vec1 = shufflevector <48 x i32> %wide.vec, <48 x i32> poison, <16 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46>
  %strided.vec2 = shufflevector <48 x i32> %wide.vec, <48 x i32> poison, <16 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47>

  store <16 x i32> %strided.vec0, ptr %out.vec0, align 32
  store <16 x i32> %strided.vec1, ptr %out.vec1, align 32
  store <16 x i32> %strided.vec2, ptr %out.vec2, align 32

  ret void
}

define void @load_i32_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) nounwind {
; SSE-LABEL: load_i32_stride3_vf32:
; SSE: # %bb.0:
; SSE-NEXT: subq $344, %rsp # imm = 0x158
; SSE-NEXT: movaps 336(%rdi), %xmm1
; SSE-NEXT: movaps 368(%rdi), %xmm9
; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 352(%rdi), %xmm14
; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 240(%rdi), %xmm15
; SSE-NEXT: movaps 272(%rdi), %xmm13
; SSE-NEXT: movaps 256(%rdi), %xmm7
; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 144(%rdi), %xmm3
; SSE-NEXT: movaps 176(%rdi), %xmm12
; SSE-NEXT: movaps 160(%rdi), %xmm10
; SSE-NEXT: movaps 48(%rdi), %xmm5
; SSE-NEXT: movaps 80(%rdi), %xmm6
; SSE-NEXT: movaps 64(%rdi), %xmm8
; SSE-NEXT: movaps %xmm8, %xmm0
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm6[1,0]
; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm5, %xmm2
; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,2]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm10, %xmm0
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm12[1,0]
; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm3, %xmm2
; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,2]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm7, %xmm0
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm13[1,0]
; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm15, %xmm2
; SSE-NEXT: movaps %xmm15, %xmm4
; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,2]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm14, %xmm0
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm9[1,0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm1, %xmm2
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,2]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 16(%rdi), %xmm7
; SSE-NEXT: movaps 32(%rdi), %xmm2
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm7, %xmm0
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[1,0]
; SSE-NEXT: movaps (%rdi), %xmm11
; SSE-NEXT: movaps %xmm11, %xmm2
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,2]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 128(%rdi), %xmm2
; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill
; SSE-NEXT: movaps 112(%rdi), %xmm14
; SSE-NEXT: movaps %xmm14, %xmm0
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[1,0]
; SSE-NEXT: movaps 96(%rdi), %xmm2
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,2]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 224(%rdi), %xmm2
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 208(%rdi), %xmm9
; SSE-NEXT: movaps %xmm9, %xmm0
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[1,0]
; SSE-NEXT: movaps 192(%rdi), %xmm2
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,2]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 320(%rdi), %xmm15
; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 304(%rdi), %xmm0
; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm15[1,0]
; SSE-NEXT: movaps 288(%rdi), %xmm2
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,2]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm5, %xmm2
; SSE-NEXT: movaps %xmm8, %xmm5
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm8[0,0]
; SSE-NEXT: movaps %xmm8, %xmm0
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm6[2,3]
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[0,2]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm10[0,0]
; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,1],xmm12[2,3]
; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm10[0,2]
; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm4, %xmm15
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,0],xmm0[0,0]
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm13[2,3]
; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,2],xmm0[0,2]
; SSE-NEXT: movaps %xmm1, %xmm12
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,0],xmm0[0,0]
; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT: # xmm0 = xmm0[3,1],mem[2,3]
; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,2],xmm0[0,2]
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; SSE-NEXT: movaps %xmm6, %xmm10
; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,0],xmm14[0,0]
; SSE-NEXT: movaps %xmm14, %xmm0
; SSE-NEXT: movaps (%rsp), %xmm8 # 16-byte Reload
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm8[2,3]
; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm0[0,2]
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm9[0,0]
; SSE-NEXT: movaps %xmm9, %xmm0
; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT: # xmm0 = xmm0[3,1],mem[2,3]
; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm0[0,2]
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, %xmm1
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[0,0]
; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; SSE-NEXT: # xmm2 = xmm2[3,1],mem[2,3]
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[2,3,2,3]
; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,0],xmm7[0,0]
; SSE-NEXT: movaps %xmm7, %xmm4
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm13[2,3]
; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm4[0,2]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,1,1]
; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
; SSE-NEXT: # xmm5 = mem[2,3,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
; SSE-NEXT: # xmm5 = xmm5[0,1],mem[0,3]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,1,1]
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm13[0,3]
; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
; SSE-NEXT: # xmm4 = mem[1,1,1,1]
; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
; SSE-NEXT: # xmm13 = mem[2,3,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm4[0],xmm13[1],xmm4[1]
; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
; SSE-NEXT: # xmm13 = xmm13[0,1],mem[0,3]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[1,1,1,1]
; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm6[2,3,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm4[0],xmm14[1],xmm4[1]
; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm8[0,3]
; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
; SSE-NEXT: # xmm7 = mem[1,1,1,1]
; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
; SSE-NEXT: # xmm4 = mem[2,3,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1]
; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
; SSE-NEXT: # xmm4 = xmm4[0,1],mem[0,3]
; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm9[1,1,1,1]
; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
; SSE-NEXT: # xmm9 = mem[2,3,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
; SSE-NEXT: # xmm9 = xmm9[0,1],mem[0,3]
; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
; SSE-NEXT: # xmm8 = mem[1,1,1,1]
; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
; SSE-NEXT: # xmm7 = mem[2,3,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
; SSE-NEXT: # xmm7 = xmm7[0,1],mem[0,3]
; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
; SSE-NEXT: # xmm8 = mem[1,1,1,1]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1]
; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,3]
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; SSE-NEXT: movaps %xmm6, 96(%rsi)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; SSE-NEXT: movaps %xmm6, 64(%rsi)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; SSE-NEXT: movaps %xmm6, 32(%rsi)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; SSE-NEXT: movaps %xmm6, (%rsi)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; SSE-NEXT: movaps %xmm6, 112(%rsi)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; SSE-NEXT: movaps %xmm6, 80(%rsi)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; SSE-NEXT: movaps %xmm6, 48(%rsi)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; SSE-NEXT: movaps %xmm6, 16(%rsi)
; SSE-NEXT: movaps %xmm1, 96(%rdx)
; SSE-NEXT: movaps %xmm3, 64(%rdx)
; SSE-NEXT: movaps %xmm10, 32(%rdx)
; SSE-NEXT: movaps %xmm11, (%rdx)
; SSE-NEXT: movaps %xmm12, 112(%rdx)
; SSE-NEXT: movaps %xmm15, 80(%rdx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: movaps %xmm1, 48(%rdx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: movaps %xmm1, 16(%rdx)
; SSE-NEXT: movaps %xmm0, 96(%rcx)
; SSE-NEXT: movaps %xmm7, 112(%rcx)
; SSE-NEXT: movaps %xmm9, 64(%rcx)
; SSE-NEXT: movaps %xmm4, 80(%rcx)
; SSE-NEXT: movaps %xmm14, 32(%rcx)
; SSE-NEXT: movaps %xmm13, 48(%rcx)
; SSE-NEXT: movaps %xmm2, (%rcx)
; SSE-NEXT: movaps %xmm5, 16(%rcx)
; SSE-NEXT: addq $344, %rsp # imm = 0x158
; SSE-NEXT: retq
;
905 ; AVX1-LABEL: load_i32_stride3_vf32:
907 ; AVX1-NEXT: subq $456, %rsp # imm = 0x1C8
908 ; AVX1-NEXT: vmovaps 352(%rdi), %ymm3
909 ; AVX1-NEXT: vmovaps 320(%rdi), %ymm9
910 ; AVX1-NEXT: vmovaps 288(%rdi), %ymm4
911 ; AVX1-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
912 ; AVX1-NEXT: vmovaps 64(%rdi), %ymm6
913 ; AVX1-NEXT: vmovaps 32(%rdi), %ymm2
914 ; AVX1-NEXT: vmovaps (%rdi), %ymm8
915 ; AVX1-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
916 ; AVX1-NEXT: vmovaps 160(%rdi), %ymm10
917 ; AVX1-NEXT: vmovaps 128(%rdi), %ymm7
918 ; AVX1-NEXT: vmovaps 96(%rdi), %ymm0
919 ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
920 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3],ymm7[4],ymm0[5,6],ymm7[7]
921 ; AVX1-NEXT: vmovaps 112(%rdi), %xmm1
922 ; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1],ymm7[1,3],ymm1[6,5],ymm7[5,7]
923 ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm1[0,2],ymm0[4,7],ymm1[4,6]
924 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm10[2,3,0,1]
925 ; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm10[1,0],ymm5[2,0],ymm10[5,4],ymm5[6,4]
926 ; AVX1-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
927 ; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
928 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
929 ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
930 ; AVX1-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
931 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0],ymm2[1],ymm8[2,3],ymm2[4],ymm8[5,6],ymm2[7]
932 ; AVX1-NEXT: vmovaps 16(%rdi), %xmm1
933 ; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1],ymm2[1,3],ymm1[6,5],ymm2[5,7]
934 ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm1[0,2],ymm0[4,7],ymm1[4,6]
935 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm6[2,3,0,1]
936 ; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm6[1,0],ymm13[2,0],ymm6[5,4],ymm13[6,4]
937 ; AVX1-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
938 ; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
939 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
940 ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
941 ; AVX1-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
942 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm9[1],ymm4[2,3],ymm9[4],ymm4[5,6],ymm9[7]
943 ; AVX1-NEXT: vmovaps 304(%rdi), %xmm1
944 ; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1],ymm9[1,3],ymm1[6,5],ymm9[5,7]
945 ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm1[0,2],ymm0[4,7],ymm1[4,6]
946 ; AVX1-NEXT: vmovaps %ymm3, %ymm4
947 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm3[2,3,0,1]
948 ; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm3[1,0],ymm15[2,0],ymm3[5,4],ymm15[6,4]
949 ; AVX1-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
950 ; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
951 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
952 ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
953 ; AVX1-NEXT: vmovaps 224(%rdi), %ymm14
954 ; AVX1-NEXT: vmovaps 208(%rdi), %xmm0
955 ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm14[1,3],ymm0[6,5],ymm14[5,7]
956 ; AVX1-NEXT: vmovaps 192(%rdi), %ymm2
957 ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm14[1],ymm2[2,3],ymm14[4],ymm2[5,6],ymm14[7]
958 ; AVX1-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
959 ; AVX1-NEXT: vshufps {{.*#+}} ymm8 = ymm1[0,3],ymm0[0,2],ymm1[4,7],ymm0[4,6]
960 ; AVX1-NEXT: vmovaps 256(%rdi), %ymm11
961 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm11[2,3,0,1]
962 ; AVX1-NEXT: vshufps {{.*#+}} ymm12 = ymm11[1,0],ymm9[2,0],ymm11[5,4],ymm9[6,4]
963 ; AVX1-NEXT: vpermilps {{.*#+}} ymm12 = ymm12[0,1,2,0,4,5,6,4]
964 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm12[6,7]
965 ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
966 ; AVX1-NEXT: vshufps {{.*#+}} ymm8 = ymm11[2,0],ymm9[3,0],ymm11[6,4],ymm9[7,4]
967 ; AVX1-NEXT: vshufps {{.*#+}} ymm12 = ymm9[0,0],ymm8[2,0],ymm9[4,4],ymm8[6,4]
968 ; AVX1-NEXT: vmovaps 208(%rdi), %xmm0
969 ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
970 ; AVX1-NEXT: vblendps {{.*#+}} ymm8 = ymm2[0,1],ymm14[2],ymm2[3,4],ymm14[5],ymm2[6,7]
; AVX1-NEXT: vshufps {{.*#+}} ymm8 = ymm8[1,2],ymm0[0,3],ymm8[5,6],ymm0[4,7]
; AVX1-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[0,2,3,1,4,6,7,5]
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4],ymm12[5,6,7]
; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT: vshufps {{.*#+}} ymm8 = ymm10[2,0],ymm5[3,0],ymm10[6,4],ymm5[7,4]
; AVX1-NEXT: vshufps {{.*#+}} ymm8 = ymm5[0,0],ymm8[2,0],ymm5[4,4],ymm8[6,4]
; AVX1-NEXT: vmovaps 112(%rdi), %xmm12
; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX1-NEXT: vblendps {{.*#+}} ymm10 = ymm5[0,1],ymm7[2],ymm5[3,4],ymm7[5],ymm5[6,7]
; AVX1-NEXT: vshufps {{.*#+}} ymm10 = ymm10[1,2],ymm12[0,3],ymm10[5,6],ymm12[4,7]
; AVX1-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[0,2,3,1,4,6,7,5]
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4],ymm8[5,6,7]
; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT: vshufps {{.*#+}} ymm8 = ymm6[2,0],ymm13[3,0],ymm6[6,4],ymm13[7,4]
; AVX1-NEXT: vshufps {{.*#+}} ymm8 = ymm13[0,0],ymm8[2,0],ymm13[4,4],ymm8[6,4]
; AVX1-NEXT: vmovaps 16(%rdi), %xmm10
; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX1-NEXT: vblendps {{.*#+}} ymm6 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
; AVX1-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,2],ymm10[0,3],ymm6[5,6],ymm10[4,7]
; AVX1-NEXT: vpermilps {{.*#+}} ymm6 = ymm6[0,2,3,1,4,6,7,5]
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm8[5,6,7]
; AVX1-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
; AVX1-NEXT: vshufps {{.*#+}} ymm6 = ymm4[2,0],ymm15[3,0],ymm4[6,4],ymm15[7,4]
; AVX1-NEXT: vshufps {{.*#+}} ymm6 = ymm15[0,0],ymm6[2,0],ymm15[4,4],ymm6[6,4]
; AVX1-NEXT: vmovaps 304(%rdi), %xmm8
; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX1-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
; AVX1-NEXT: vshufps {{.*#+}} ymm4 = ymm4[1,2],ymm8[0,3],ymm4[5,6],ymm8[4,7]
; AVX1-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[0,2,3,1,4,6,7,5]
; AVX1-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm6[5,6,7]
; AVX1-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm5[2],ymm7[3,4],ymm5[5],ymm7[6,7]
; AVX1-NEXT: vshufps {{.*#+}} ymm6 = ymm12[1,0],ymm6[2,0],ymm12[5,4],ymm6[6,4]
; AVX1-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,0],ymm7[0,3],ymm6[6,4],ymm7[4,7]
; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX1-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload
; AVX1-NEXT: # ymm7 = ymm5[0,1],mem[0,3],ymm5[4,5],mem[4,7]
; AVX1-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7]
; AVX1-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
; AVX1-NEXT: vshufps {{.*#+}} ymm7 = ymm10[1,0],ymm7[2,0],ymm10[5,4],ymm7[6,4]
; AVX1-NEXT: vshufps {{.*#+}} ymm5 = ymm7[2,0],ymm2[0,3],ymm7[6,4],ymm2[4,7]
; AVX1-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm7 # 32-byte Folded Reload
; AVX1-NEXT: # ymm7 = ymm13[0,1],mem[0,3],ymm13[4,5],mem[4,7]
; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm7[5,6,7]
; AVX1-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
; AVX1-NEXT: vshufps {{.*#+}} ymm7 = ymm8[1,0],ymm7[2,0],ymm8[5,4],ymm7[6,4]
; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm7[2,0],ymm1[0,3],ymm7[6,4],ymm1[4,7]
; AVX1-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm7 # 32-byte Folded Reload
; AVX1-NEXT: # ymm7 = ymm15[0,1],mem[0,3],ymm15[4,5],mem[4,7]
; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5,6,7]
; AVX1-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload
; AVX1-NEXT: # ymm3 = ymm14[0,1],mem[2],ymm14[3,4],mem[5],ymm14[6,7]
; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT: vshufps {{.*#+}} ymm3 = ymm0[1,0],ymm3[2,0],ymm0[5,4],ymm3[6,4]
; AVX1-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0],ymm14[0,3],ymm3[6,4],ymm14[4,7]
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm9[0,1],ymm11[0,3],ymm9[4,5],ymm11[4,7]
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7]
; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX1-NEXT: vmovaps %ymm1, 64(%rsi)
; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX1-NEXT: vmovaps %ymm1, 96(%rsi)
; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX1-NEXT: vmovaps %ymm1, (%rsi)
; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX1-NEXT: vmovaps %ymm1, 32(%rsi)
; AVX1-NEXT: vmovaps %ymm4, 96(%rdx)
; AVX1-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload
; AVX1-NEXT: vmovaps %ymm1, (%rdx)
; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX1-NEXT: vmovaps %ymm1, 32(%rdx)
; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX1-NEXT: vmovaps %ymm1, 64(%rdx)
; AVX1-NEXT: vmovaps %ymm0, 64(%rcx)
; AVX1-NEXT: vmovaps %ymm2, 96(%rcx)
; AVX1-NEXT: vmovaps %ymm5, (%rcx)
; AVX1-NEXT: vmovaps %ymm6, 32(%rcx)
; AVX1-NEXT: addq $456, %rsp # imm = 0x1C8
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: load_i32_stride3_vf32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: subq $136, %rsp
; AVX2-SLOW-NEXT: vmovaps 320(%rdi), %ymm15
; AVX2-SLOW-NEXT: vmovaps 224(%rdi), %ymm3
; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %ymm2
; AVX2-SLOW-NEXT: vmovaps 256(%rdi), %ymm1
; AVX2-SLOW-NEXT: vmovaps 128(%rdi), %ymm6
; AVX2-SLOW-NEXT: vmovaps 160(%rdi), %ymm10
; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm4
; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm7
; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %ymm0
; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %ymm14
; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm12 = [21474836482,21474836482,21474836482,21474836482]
; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm12, %ymm0
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm4[0],ymm7[1],ymm4[2,3],ymm7[4],ymm4[5,6],ymm7[7]
; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm13 = <0,3,6,1,4,7,u,u>
; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm13, %ymm5
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7]
; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vpermps %ymm10, %ymm12, %ymm0
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0],ymm6[1],ymm14[2,3],ymm6[4],ymm14[5,6],ymm6[7]
; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm13, %ymm5
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7]
; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm12, %ymm0
; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7]
; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm13, %ymm9
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7]
; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vmovaps 288(%rdi), %ymm11
; AVX2-SLOW-NEXT: vmovaps %ymm15, %ymm4
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm15 = ymm11[0],ymm15[1],ymm11[2,3],ymm15[4],ymm11[5,6],ymm15[7]
; AVX2-SLOW-NEXT: vpermps %ymm15, %ymm13, %ymm13
; AVX2-SLOW-NEXT: vmovaps 352(%rdi), %ymm8
; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm12, %ymm12
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm12[6,7]
; AVX2-SLOW-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm3 = <1,4,7,2,5,u,u,u>
; AVX2-SLOW-NEXT: vpermps %ymm12, %ymm3, %ymm12
; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,0,3,6,0,0,3,6]
; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,0,1]
; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm0, %ymm15
; AVX2-SLOW-NEXT: vmovaps %ymm1, %ymm13
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm15[5,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm15 = ymm14[0,1],ymm6[2],ymm14[3,4],ymm6[5],ymm14[6,7]
; AVX2-SLOW-NEXT: vpermps %ymm15, %ymm3, %ymm15
; AVX2-SLOW-NEXT: vpermps %ymm10, %ymm0, %ymm5
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3,4],ymm5[5,6,7]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1],ymm7[2],ymm1[3,4],ymm7[5],ymm1[6,7]
; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm3, %ymm5
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm0, %ymm9
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm9[5,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1],ymm4[2],ymm11[3,4],ymm4[5],ymm11[6,7]
; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm3, %ymm3
; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm14[2],ymm6[3,4],ymm14[5],ymm6[6,7]
; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm9 = <2,5,0,3,6,u,u,u>
; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm9, %ymm3
; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[0,1,0,3,4,5,4,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,0,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm10[5,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm1[2],ymm7[3,4],ymm1[5],ymm7[6,7]
; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm9, %ymm6
; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm7 = ymm2[0,1,0,3,4,5,4,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm11[2],ymm4[3,4],ymm11[5],ymm4[6,7]
; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm9, %ymm2
; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm8[0,1,0,3,4,5,4,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-SLOW-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm2 = ymm2[0,1],mem[2],ymm2[3,4],mem[5],ymm2[6,7]
; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm9, %ymm2
; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm4 = ymm13[0,1,0,3,4,5,4,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7]
; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm4 # 32-byte Reload
; AVX2-SLOW-NEXT: vmovaps %ymm4, 96(%rsi)
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-SLOW-NEXT: vmovaps %ymm4, 64(%rsi)
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-SLOW-NEXT: vmovaps %ymm4, 32(%rsi)
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-SLOW-NEXT: vmovaps %ymm4, (%rsi)
; AVX2-SLOW-NEXT: vmovaps %ymm0, 96(%rdx)
; AVX2-SLOW-NEXT: vmovaps %ymm5, (%rdx)
; AVX2-SLOW-NEXT: vmovaps %ymm15, 32(%rdx)
; AVX2-SLOW-NEXT: vmovaps %ymm12, 64(%rdx)
; AVX2-SLOW-NEXT: vmovaps %ymm2, 64(%rcx)
; AVX2-SLOW-NEXT: vmovaps %ymm1, 96(%rcx)
; AVX2-SLOW-NEXT: vmovaps %ymm6, (%rcx)
; AVX2-SLOW-NEXT: vmovaps %ymm3, 32(%rcx)
; AVX2-SLOW-NEXT: addq $136, %rsp
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-ALL-LABEL: load_i32_stride3_vf32:
; AVX2-FAST-ALL: # %bb.0:
; AVX2-FAST-ALL-NEXT: subq $72, %rsp
; AVX2-FAST-ALL-NEXT: vmovaps 320(%rdi), %ymm1
; AVX2-FAST-ALL-NEXT: vmovaps 224(%rdi), %ymm0
; AVX2-FAST-ALL-NEXT: vmovaps 192(%rdi), %ymm2
; AVX2-FAST-ALL-NEXT: vmovaps 256(%rdi), %ymm3
; AVX2-FAST-ALL-NEXT: vmovaps 128(%rdi), %ymm9
; AVX2-FAST-ALL-NEXT: vmovaps 160(%rdi), %ymm10
; AVX2-FAST-ALL-NEXT: vmovaps (%rdi), %ymm5
; AVX2-FAST-ALL-NEXT: vmovaps 32(%rdi), %ymm7
; AVX2-FAST-ALL-NEXT: vmovaps 64(%rdi), %ymm8
; AVX2-FAST-ALL-NEXT: vmovaps 96(%rdi), %ymm12
; AVX2-FAST-ALL-NEXT: vbroadcastsd {{.*#+}} ymm4 = [21474836482,21474836482,21474836482,21474836482]
; AVX2-FAST-ALL-NEXT: vpermps %ymm8, %ymm4, %ymm6
; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm11 = ymm5[0],ymm7[1],ymm5[2,3],ymm7[4],ymm5[5,6],ymm7[7]
; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm13 = <0,3,6,1,4,7,u,u>
; AVX2-FAST-ALL-NEXT: vpermps %ymm11, %ymm13, %ymm11
; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm6 = ymm11[0,1,2,3,4,5],ymm6[6,7]
; AVX2-FAST-ALL-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-ALL-NEXT: vpermps %ymm10, %ymm4, %ymm6
; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0],ymm9[1],ymm12[2,3],ymm9[4],ymm12[5,6],ymm9[7]
; AVX2-FAST-ALL-NEXT: vpermps %ymm11, %ymm13, %ymm11
; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm6 = ymm11[0,1,2,3,4,5],ymm6[6,7]
; AVX2-FAST-ALL-NEXT: vmovups %ymm6, (%rsp) # 32-byte Spill
; AVX2-FAST-ALL-NEXT: vmovaps %ymm3, %ymm6
; AVX2-FAST-ALL-NEXT: vpermps %ymm3, %ymm4, %ymm11
; AVX2-FAST-ALL-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-ALL-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm14 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7]
; AVX2-FAST-ALL-NEXT: vpermps %ymm14, %ymm13, %ymm14
; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm3 = ymm14[0,1,2,3,4,5],ymm11[6,7]
; AVX2-FAST-ALL-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-ALL-NEXT: vmovaps 288(%rdi), %ymm15
; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0],ymm1[1],ymm15[2,3],ymm1[4],ymm15[5,6],ymm1[7]
; AVX2-FAST-ALL-NEXT: vmovaps %ymm1, %ymm3
; AVX2-FAST-ALL-NEXT: vpermps %ymm14, %ymm13, %ymm13
; AVX2-FAST-ALL-NEXT: vmovaps 352(%rdi), %ymm1
; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm4, %ymm4
; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm4 = ymm13[0,1,2,3,4,5],ymm4[6,7]
; AVX2-FAST-ALL-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7]
; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm2 = <1,4,7,2,5,u,u,u>
; AVX2-FAST-ALL-NEXT: vpermps %ymm4, %ymm2, %ymm4
; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,0,3,6,0,0,3,6]
; AVX2-FAST-ALL-NEXT: # ymm0 = mem[0,1,0,1]
; AVX2-FAST-ALL-NEXT: vpermps %ymm6, %ymm0, %ymm13
; AVX2-FAST-ALL-NEXT: vmovaps %ymm6, %ymm14
; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm13 = ymm4[0,1,2,3,4],ymm13[5,6,7]
; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm4 = ymm12[0,1],ymm9[2],ymm12[3,4],ymm9[5],ymm12[6,7]
; AVX2-FAST-ALL-NEXT: vpermps %ymm4, %ymm2, %ymm4
; AVX2-FAST-ALL-NEXT: vpermps %ymm10, %ymm0, %ymm6
; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm6[5,6,7]
; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm6 = ymm5[0,1],ymm7[2],ymm5[3,4],ymm7[5],ymm5[6,7]
; AVX2-FAST-ALL-NEXT: vpermps %ymm6, %ymm2, %ymm6
; AVX2-FAST-ALL-NEXT: vpermps %ymm8, %ymm0, %ymm11
; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm11[5,6,7]
; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1],ymm3[2],ymm15[3,4],ymm3[5],ymm15[6,7]
; AVX2-FAST-ALL-NEXT: vpermps %ymm11, %ymm2, %ymm2
; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm0, %ymm0
; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7]
; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1],ymm12[2],ymm9[3,4],ymm12[5],ymm9[6,7]
; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm9 = [0,1,0,3,0,1,4,7]
; AVX2-FAST-ALL-NEXT: vpermps %ymm10, %ymm9, %ymm10
; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm11 = <2,5,0,3,6,u,u,u>
; AVX2-FAST-ALL-NEXT: vpermps %ymm2, %ymm11, %ymm2
; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm10[5,6,7]
; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1],ymm5[2],ymm7[3,4],ymm5[5],ymm7[6,7]
; AVX2-FAST-ALL-NEXT: vpermps %ymm8, %ymm9, %ymm7
; AVX2-FAST-ALL-NEXT: vpermps %ymm5, %ymm11, %ymm5
; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm7[5,6,7]
; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm7 = ymm3[0,1],ymm15[2],ymm3[3,4],ymm15[5],ymm3[6,7]
; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm9, %ymm1
; AVX2-FAST-ALL-NEXT: vpermps %ymm7, %ymm11, %ymm7
; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4],ymm1[5,6,7]
; AVX2-FAST-ALL-NEXT: vpermps %ymm14, %ymm9, %ymm3
; AVX2-FAST-ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX2-FAST-ALL-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
; AVX2-FAST-ALL-NEXT: # ymm7 = ymm7[0,1],mem[2],ymm7[3,4],mem[5],ymm7[6,7]
; AVX2-FAST-ALL-NEXT: vpermps %ymm7, %ymm11, %ymm7
; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3,4],ymm3[5,6,7]
; AVX2-FAST-ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX2-FAST-ALL-NEXT: vmovaps %ymm7, 96(%rsi)
; AVX2-FAST-ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX2-FAST-ALL-NEXT: vmovaps %ymm7, 64(%rsi)
; AVX2-FAST-ALL-NEXT: vmovups (%rsp), %ymm7 # 32-byte Reload
; AVX2-FAST-ALL-NEXT: vmovaps %ymm7, 32(%rsi)
; AVX2-FAST-ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX2-FAST-ALL-NEXT: vmovaps %ymm7, (%rsi)
; AVX2-FAST-ALL-NEXT: vmovaps %ymm0, 96(%rdx)
; AVX2-FAST-ALL-NEXT: vmovaps %ymm6, (%rdx)
; AVX2-FAST-ALL-NEXT: vmovaps %ymm4, 32(%rdx)
; AVX2-FAST-ALL-NEXT: vmovaps %ymm13, 64(%rdx)
; AVX2-FAST-ALL-NEXT: vmovaps %ymm3, 64(%rcx)
; AVX2-FAST-ALL-NEXT: vmovaps %ymm1, 96(%rcx)
; AVX2-FAST-ALL-NEXT: vmovaps %ymm5, (%rcx)
; AVX2-FAST-ALL-NEXT: vmovaps %ymm2, 32(%rcx)
; AVX2-FAST-ALL-NEXT: addq $72, %rsp
; AVX2-FAST-ALL-NEXT: vzeroupper
; AVX2-FAST-ALL-NEXT: retq
;
; AVX2-FAST-PERLANE-LABEL: load_i32_stride3_vf32:
; AVX2-FAST-PERLANE: # %bb.0:
; AVX2-FAST-PERLANE-NEXT: subq $136, %rsp
; AVX2-FAST-PERLANE-NEXT: vmovaps 320(%rdi), %ymm15
; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdi), %ymm3
; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %ymm2
; AVX2-FAST-PERLANE-NEXT: vmovaps 256(%rdi), %ymm1
; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rdi), %ymm6
; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rdi), %ymm10
; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm4
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm7
; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %ymm0
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %ymm14
; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm12 = [21474836482,21474836482,21474836482,21474836482]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm12, %ymm0
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm4[0],ymm7[1],ymm4[2,3],ymm7[4],ymm4[5,6],ymm7[7]
; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm13 = <0,3,6,1,4,7,u,u>
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm13, %ymm5
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm10, %ymm12, %ymm0
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0],ymm6[1],ymm14[2,3],ymm6[4],ymm14[5,6],ymm6[7]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm13, %ymm5
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm12, %ymm0
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm13, %ymm9
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovaps 288(%rdi), %ymm11
; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm15, %ymm4
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm15 = ymm11[0],ymm15[1],ymm11[2,3],ymm15[4],ymm11[5,6],ymm15[7]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm15, %ymm13, %ymm13
; AVX2-FAST-PERLANE-NEXT: vmovaps 352(%rdi), %ymm8
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm12, %ymm12
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm12[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm3 = <1,4,7,2,5,u,u,u>
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm12, %ymm3, %ymm12
; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,0,3,6,0,0,3,6]
; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,0,1]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm0, %ymm15
; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, %ymm13
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm15[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm15 = ymm14[0,1],ymm6[2],ymm14[3,4],ymm6[5],ymm14[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm15, %ymm3, %ymm15
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm10, %ymm0, %ymm5
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3,4],ymm5[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1],ymm7[2],ymm1[3,4],ymm7[5],ymm1[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm3, %ymm5
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm0, %ymm9
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm9[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1],ymm4[2],ymm11[3,4],ymm4[5],ymm11[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm3, %ymm3
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm14[2],ymm6[3,4],ymm14[5],ymm6[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm9 = <2,5,0,3,6,u,u,u>
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm9, %ymm3
; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[0,1,0,3,4,5,4,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,0,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm10[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm1[2],ymm7[3,4],ymm1[5],ymm7[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm9, %ymm6
; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm7 = ymm2[0,1,0,3,4,5,4,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm11[2],ymm4[3,4],ymm11[5],ymm4[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm9, %ymm2
; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm8[0,1,0,3,4,5,4,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm2[0,1],mem[2],ymm2[3,4],mem[5],ymm2[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm9, %ymm2
; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm4 = ymm13[0,1,0,3,4,5,4,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm4 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 96(%rsi)
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 64(%rsi)
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 32(%rsi)
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, (%rsi)
; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 96(%rdx)
; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, (%rdx)
; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm15, 32(%rdx)
; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm12, 64(%rdx)
; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 64(%rcx)
; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 96(%rcx)
; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, (%rcx)
; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 32(%rcx)
; AVX2-FAST-PERLANE-NEXT: addq $136, %rsp
; AVX2-FAST-PERLANE-NEXT: vzeroupper
; AVX2-FAST-PERLANE-NEXT: retq
;
; AVX512-LABEL: load_i32_stride3_vf32:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqu64 320(%rdi), %zmm0
; AVX512-NEXT: vmovdqu64 256(%rdi), %zmm1
; AVX512-NEXT: vmovdqu64 (%rdi), %zmm2
; AVX512-NEXT: vmovdqu64 64(%rdi), %zmm3
; AVX512-NEXT: vmovdqu64 128(%rdi), %zmm4
; AVX512-NEXT: vmovdqu64 192(%rdi), %zmm5
; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = <0,3,6,9,12,15,18,21,24,27,30,u,u,u,u,u>
; AVX512-NEXT: vmovdqa64 %zmm5, %zmm7
; AVX512-NEXT: vpermt2d %zmm1, %zmm6, %zmm7
; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29]
; AVX512-NEXT: vpermt2d %zmm0, %zmm8, %zmm7
; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm6
; AVX512-NEXT: vpermt2d %zmm4, %zmm8, %zmm6
; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = <17,20,23,26,29,0,3,6,9,12,15,u,u,u,u,u>
; AVX512-NEXT: vmovdqa64 %zmm1, %zmm9
; AVX512-NEXT: vpermt2d %zmm5, %zmm8, %zmm9
; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30]
; AVX512-NEXT: vpermt2d %zmm0, %zmm10, %zmm9
; AVX512-NEXT: vpermi2d %zmm2, %zmm3, %zmm8
; AVX512-NEXT: vpermt2d %zmm4, %zmm10, %zmm8
; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm10 = <2,5,8,11,14,17,20,23,26,29,u,u,u,u,u,u>
; AVX512-NEXT: vpermt2d %zmm1, %zmm10, %zmm5
; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31]
; AVX512-NEXT: vpermt2d %zmm0, %zmm1, %zmm5
; AVX512-NEXT: vpermt2d %zmm3, %zmm10, %zmm2
; AVX512-NEXT: vpermt2d %zmm4, %zmm1, %zmm2
; AVX512-NEXT: vmovdqu64 %zmm7, 64(%rsi)
; AVX512-NEXT: vmovdqu64 %zmm6, (%rsi)
; AVX512-NEXT: vmovdqu64 %zmm9, 64(%rdx)
; AVX512-NEXT: vmovdqu64 %zmm8, (%rdx)
; AVX512-NEXT: vmovdqu64 %zmm5, 64(%rcx)
; AVX512-NEXT: vmovdqu64 %zmm2, (%rcx)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%wide.vec = load <96 x i32>, ptr %in.vec, align 32

%strided.vec0 = shufflevector <96 x i32> %wide.vec, <96 x i32> poison, <32 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45, i32 48, i32 51, i32 54, i32 57, i32 60, i32 63, i32 66, i32 69, i32 72, i32 75, i32 78, i32 81, i32 84, i32 87, i32 90, i32 93>
%strided.vec1 = shufflevector <96 x i32> %wide.vec, <96 x i32> poison, <32 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46, i32 49, i32 52, i32 55, i32 58, i32 61, i32 64, i32 67, i32 70, i32 73, i32 76, i32 79, i32 82, i32 85, i32 88, i32 91, i32 94>
%strided.vec2 = shufflevector <96 x i32> %wide.vec, <96 x i32> poison, <32 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47, i32 50, i32 53, i32 56, i32 59, i32 62, i32 65, i32 68, i32 71, i32 74, i32 77, i32 80, i32 83, i32 86, i32 89, i32 92, i32 95>

store <32 x i32> %strided.vec0, ptr %out.vec0, align 32
store <32 x i32> %strided.vec1, ptr %out.vec1, align 32
store <32 x i32> %strided.vec2, ptr %out.vec2, align 32