1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
3 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX
4 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
5 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FP
6 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FCP
7 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512
8 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512-FCP
9 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512DQ
10 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-FCP
11 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW
12 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW-FCP
13 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512DQ-BW
14 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-BW-FCP
16 ; These patterns are produced by LoopVectorizer for interleaved stores.
18 define void @store_i32_stride3_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %out.vec) nounwind {
19 ; SSE-LABEL: store_i32_stride3_vf2:
21 ; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
22 ; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
23 ; SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
24 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
25 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
26 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm2[1,0]
27 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[3,3,3,3]
28 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[0,2]
29 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
30 ; SSE-NEXT: movq %xmm3, 16(%rcx)
31 ; SSE-NEXT: movaps %xmm2, (%rcx)
34 ; AVX-LABEL: store_i32_stride3_vf2:
36 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
37 ; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
38 ; AVX-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
39 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
40 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm2
41 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
42 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,0,3,7,5,4,7]
43 ; AVX-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[0,2,u,1,u,5,u,u]
44 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6,7]
45 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
46 ; AVX-NEXT: vmovlps %xmm1, 16(%rcx)
47 ; AVX-NEXT: vmovaps %xmm0, (%rcx)
48 ; AVX-NEXT: vzeroupper
51 ; AVX2-LABEL: store_i32_stride3_vf2:
53 ; AVX2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
54 ; AVX2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
55 ; AVX2-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
56 ; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
57 ; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
58 ; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,1,3,5,u,u]
59 ; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
60 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
61 ; AVX2-NEXT: vmovlps %xmm1, 16(%rcx)
62 ; AVX2-NEXT: vmovaps %xmm0, (%rcx)
63 ; AVX2-NEXT: vzeroupper
66 ; AVX2-FP-LABEL: store_i32_stride3_vf2:
68 ; AVX2-FP-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
69 ; AVX2-FP-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
70 ; AVX2-FP-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
71 ; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
72 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
73 ; AVX2-FP-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,1,3,5,u,u]
74 ; AVX2-FP-NEXT: vpermps %ymm0, %ymm1, %ymm0
75 ; AVX2-FP-NEXT: vextractf128 $1, %ymm0, %xmm1
76 ; AVX2-FP-NEXT: vmovlps %xmm1, 16(%rcx)
77 ; AVX2-FP-NEXT: vmovaps %xmm0, (%rcx)
78 ; AVX2-FP-NEXT: vzeroupper
81 ; AVX2-FCP-LABEL: store_i32_stride3_vf2:
83 ; AVX2-FCP-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
84 ; AVX2-FCP-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
85 ; AVX2-FCP-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
86 ; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
87 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
88 ; AVX2-FCP-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,1,3,5,u,u]
89 ; AVX2-FCP-NEXT: vpermps %ymm0, %ymm1, %ymm0
90 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm0, %xmm1
91 ; AVX2-FCP-NEXT: vmovlps %xmm1, 16(%rcx)
92 ; AVX2-FCP-NEXT: vmovaps %xmm0, (%rcx)
93 ; AVX2-FCP-NEXT: vzeroupper
96 ; AVX512-LABEL: store_i32_stride3_vf2:
98 ; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
99 ; AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
100 ; AVX512-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
101 ; AVX512-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
102 ; AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
103 ; AVX512-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,1,3,5,u,u]
104 ; AVX512-NEXT: vpermps %ymm0, %ymm1, %ymm0
105 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
106 ; AVX512-NEXT: vmovlps %xmm1, 16(%rcx)
107 ; AVX512-NEXT: vmovaps %xmm0, (%rcx)
108 ; AVX512-NEXT: vzeroupper
111 ; AVX512-FCP-LABEL: store_i32_stride3_vf2:
112 ; AVX512-FCP: # %bb.0:
113 ; AVX512-FCP-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
114 ; AVX512-FCP-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
115 ; AVX512-FCP-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
116 ; AVX512-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
117 ; AVX512-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
118 ; AVX512-FCP-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,1,3,5,u,u]
119 ; AVX512-FCP-NEXT: vpermps %ymm0, %ymm1, %ymm0
120 ; AVX512-FCP-NEXT: vextractf128 $1, %ymm0, %xmm1
121 ; AVX512-FCP-NEXT: vmovlps %xmm1, 16(%rcx)
122 ; AVX512-FCP-NEXT: vmovaps %xmm0, (%rcx)
123 ; AVX512-FCP-NEXT: vzeroupper
124 ; AVX512-FCP-NEXT: retq
126 ; AVX512DQ-LABEL: store_i32_stride3_vf2:
128 ; AVX512DQ-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
129 ; AVX512DQ-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
130 ; AVX512DQ-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
131 ; AVX512DQ-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
132 ; AVX512DQ-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
133 ; AVX512DQ-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,1,3,5,u,u]
134 ; AVX512DQ-NEXT: vpermps %ymm0, %ymm1, %ymm0
135 ; AVX512DQ-NEXT: vextractf128 $1, %ymm0, %xmm1
136 ; AVX512DQ-NEXT: vmovlps %xmm1, 16(%rcx)
137 ; AVX512DQ-NEXT: vmovaps %xmm0, (%rcx)
138 ; AVX512DQ-NEXT: vzeroupper
139 ; AVX512DQ-NEXT: retq
141 ; AVX512DQ-FCP-LABEL: store_i32_stride3_vf2:
142 ; AVX512DQ-FCP: # %bb.0:
143 ; AVX512DQ-FCP-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
144 ; AVX512DQ-FCP-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
145 ; AVX512DQ-FCP-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
146 ; AVX512DQ-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
147 ; AVX512DQ-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
148 ; AVX512DQ-FCP-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,1,3,5,u,u]
149 ; AVX512DQ-FCP-NEXT: vpermps %ymm0, %ymm1, %ymm0
150 ; AVX512DQ-FCP-NEXT: vextractf128 $1, %ymm0, %xmm1
151 ; AVX512DQ-FCP-NEXT: vmovlps %xmm1, 16(%rcx)
152 ; AVX512DQ-FCP-NEXT: vmovaps %xmm0, (%rcx)
153 ; AVX512DQ-FCP-NEXT: vzeroupper
154 ; AVX512DQ-FCP-NEXT: retq
156 ; AVX512BW-LABEL: store_i32_stride3_vf2:
158 ; AVX512BW-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
159 ; AVX512BW-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
160 ; AVX512BW-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
161 ; AVX512BW-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
162 ; AVX512BW-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
163 ; AVX512BW-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,1,3,5,u,u]
164 ; AVX512BW-NEXT: vpermps %ymm0, %ymm1, %ymm0
165 ; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm1
166 ; AVX512BW-NEXT: vmovlps %xmm1, 16(%rcx)
167 ; AVX512BW-NEXT: vmovaps %xmm0, (%rcx)
168 ; AVX512BW-NEXT: vzeroupper
169 ; AVX512BW-NEXT: retq
171 ; AVX512BW-FCP-LABEL: store_i32_stride3_vf2:
172 ; AVX512BW-FCP: # %bb.0:
173 ; AVX512BW-FCP-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
174 ; AVX512BW-FCP-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
175 ; AVX512BW-FCP-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
176 ; AVX512BW-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
177 ; AVX512BW-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
178 ; AVX512BW-FCP-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,1,3,5,u,u]
179 ; AVX512BW-FCP-NEXT: vpermps %ymm0, %ymm1, %ymm0
180 ; AVX512BW-FCP-NEXT: vextractf128 $1, %ymm0, %xmm1
181 ; AVX512BW-FCP-NEXT: vmovlps %xmm1, 16(%rcx)
182 ; AVX512BW-FCP-NEXT: vmovaps %xmm0, (%rcx)
183 ; AVX512BW-FCP-NEXT: vzeroupper
184 ; AVX512BW-FCP-NEXT: retq
186 ; AVX512DQ-BW-LABEL: store_i32_stride3_vf2:
187 ; AVX512DQ-BW: # %bb.0:
188 ; AVX512DQ-BW-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
189 ; AVX512DQ-BW-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
190 ; AVX512DQ-BW-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
191 ; AVX512DQ-BW-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
192 ; AVX512DQ-BW-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
193 ; AVX512DQ-BW-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,1,3,5,u,u]
194 ; AVX512DQ-BW-NEXT: vpermps %ymm0, %ymm1, %ymm0
195 ; AVX512DQ-BW-NEXT: vextractf128 $1, %ymm0, %xmm1
196 ; AVX512DQ-BW-NEXT: vmovlps %xmm1, 16(%rcx)
197 ; AVX512DQ-BW-NEXT: vmovaps %xmm0, (%rcx)
198 ; AVX512DQ-BW-NEXT: vzeroupper
199 ; AVX512DQ-BW-NEXT: retq
201 ; AVX512DQ-BW-FCP-LABEL: store_i32_stride3_vf2:
202 ; AVX512DQ-BW-FCP: # %bb.0:
203 ; AVX512DQ-BW-FCP-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
204 ; AVX512DQ-BW-FCP-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
205 ; AVX512DQ-BW-FCP-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
206 ; AVX512DQ-BW-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
207 ; AVX512DQ-BW-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
208 ; AVX512DQ-BW-FCP-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,1,3,5,u,u]
209 ; AVX512DQ-BW-FCP-NEXT: vpermps %ymm0, %ymm1, %ymm0
210 ; AVX512DQ-BW-FCP-NEXT: vextractf128 $1, %ymm0, %xmm1
211 ; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm1, 16(%rcx)
212 ; AVX512DQ-BW-FCP-NEXT: vmovaps %xmm0, (%rcx)
213 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
214 ; AVX512DQ-BW-FCP-NEXT: retq
215 %in.vec0 = load <2 x i32>, ptr %in.vecptr0, align 64
216 %in.vec1 = load <2 x i32>, ptr %in.vecptr1, align 64
217 %in.vec2 = load <2 x i32>, ptr %in.vecptr2, align 64
218 %1 = shufflevector <2 x i32> %in.vec0, <2 x i32> %in.vec1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
219 %2 = shufflevector <2 x i32> %in.vec2, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
220 %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
221 %interleaved.vec = shufflevector <6 x i32> %3, <6 x i32> poison, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
222 store <6 x i32> %interleaved.vec, ptr %out.vec, align 64
226 define void @store_i32_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %out.vec) nounwind {
227 ; SSE-LABEL: store_i32_stride3_vf4:
229 ; SSE-NEXT: movaps (%rdi), %xmm0
230 ; SSE-NEXT: movaps (%rsi), %xmm1
231 ; SSE-NEXT: movaps (%rdx), %xmm2
232 ; SSE-NEXT: movaps %xmm0, %xmm3
233 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[0,3]
234 ; SSE-NEXT: movaps %xmm0, %xmm4
235 ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
236 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0]
237 ; SSE-NEXT: movaps %xmm0, %xmm3
238 ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1]
239 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3]
240 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1]
241 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2]
242 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm2[2,3]
243 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3]
244 ; SSE-NEXT: movaps %xmm1, 16(%rcx)
245 ; SSE-NEXT: movaps %xmm4, (%rcx)
246 ; SSE-NEXT: movaps %xmm0, 32(%rcx)
249 ; AVX-LABEL: store_i32_stride3_vf4:
251 ; AVX-NEXT: vmovaps (%rdi), %xmm0
252 ; AVX-NEXT: vmovaps (%rsi), %xmm1
253 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2
254 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm3
255 ; AVX-NEXT: vmovsldup {{.*#+}} ymm3 = ymm3[0,0,2,2,4,4,6,6]
256 ; AVX-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,u,u,1,5,u,u,6]
257 ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4,5],ymm3[6],ymm2[7]
258 ; AVX-NEXT: vbroadcastsd (%rdx), %ymm3
259 ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
260 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3]
261 ; AVX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
262 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
263 ; AVX-NEXT: vmovaps %xmm0, 32(%rcx)
264 ; AVX-NEXT: vmovaps %ymm2, (%rcx)
265 ; AVX-NEXT: vzeroupper
268 ; AVX2-LABEL: store_i32_stride3_vf4:
270 ; AVX2-NEXT: vmovaps (%rdi), %xmm0
271 ; AVX2-NEXT: vmovaps (%rsi), %xmm1
272 ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2
273 ; AVX2-NEXT: vmovaps {{.*#+}} ymm3 = [0,4,u,1,5,u,2,6]
274 ; AVX2-NEXT: vpermps %ymm2, %ymm3, %ymm2
275 ; AVX2-NEXT: vbroadcastsd (%rdx), %ymm3
276 ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
277 ; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3]
278 ; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
279 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
280 ; AVX2-NEXT: vmovaps %xmm0, 32(%rcx)
281 ; AVX2-NEXT: vmovaps %ymm2, (%rcx)
282 ; AVX2-NEXT: vzeroupper
285 ; AVX2-FP-LABEL: store_i32_stride3_vf4:
287 ; AVX2-FP-NEXT: vmovaps (%rdi), %xmm0
288 ; AVX2-FP-NEXT: vmovaps (%rsi), %xmm1
289 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2
290 ; AVX2-FP-NEXT: vmovaps {{.*#+}} ymm3 = [0,4,u,1,5,u,2,6]
291 ; AVX2-FP-NEXT: vpermps %ymm2, %ymm3, %ymm2
292 ; AVX2-FP-NEXT: vbroadcastsd (%rdx), %ymm3
293 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
294 ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3]
295 ; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
296 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
297 ; AVX2-FP-NEXT: vmovaps %xmm0, 32(%rcx)
298 ; AVX2-FP-NEXT: vmovaps %ymm2, (%rcx)
299 ; AVX2-FP-NEXT: vzeroupper
302 ; AVX2-FCP-LABEL: store_i32_stride3_vf4:
304 ; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm0
305 ; AVX2-FCP-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0
306 ; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm1 = [7,3,7,3,7,3,7,3]
307 ; AVX2-FCP-NEXT: vpermps %ymm0, %ymm1, %ymm1
308 ; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0]
309 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3]
310 ; AVX2-FCP-NEXT: vmovaps {{.*#+}} ymm2 = [0,4,u,1,5,u,2,6]
311 ; AVX2-FCP-NEXT: vpermps %ymm0, %ymm2, %ymm0
312 ; AVX2-FCP-NEXT: vbroadcastsd (%rdx), %ymm2
313 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7]
314 ; AVX2-FCP-NEXT: vmovaps %ymm0, (%rcx)
315 ; AVX2-FCP-NEXT: vmovaps %xmm1, 32(%rcx)
316 ; AVX2-FCP-NEXT: vzeroupper
317 ; AVX2-FCP-NEXT: retq
319 ; AVX512-LABEL: store_i32_stride3_vf4:
321 ; AVX512-NEXT: vmovaps (%rdi), %xmm0
322 ; AVX512-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0
323 ; AVX512-NEXT: vinsertf32x4 $2, (%rdx), %zmm0, %zmm0
324 ; AVX512-NEXT: vmovaps {{.*#+}} zmm1 = [0,4,8,1,5,9,2,6,10,3,7,11,u,u,u,u]
325 ; AVX512-NEXT: vpermps %zmm0, %zmm1, %zmm0
326 ; AVX512-NEXT: vextractf32x4 $2, %zmm0, 32(%rcx)
327 ; AVX512-NEXT: vmovaps %ymm0, (%rcx)
328 ; AVX512-NEXT: vzeroupper
331 ; AVX512-FCP-LABEL: store_i32_stride3_vf4:
332 ; AVX512-FCP: # %bb.0:
333 ; AVX512-FCP-NEXT: vmovaps (%rdi), %xmm0
334 ; AVX512-FCP-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0
335 ; AVX512-FCP-NEXT: vinsertf32x4 $2, (%rdx), %zmm0, %zmm0
336 ; AVX512-FCP-NEXT: vmovaps {{.*#+}} zmm1 = [0,4,8,1,5,9,2,6,10,3,7,11,u,u,u,u]
337 ; AVX512-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm0
338 ; AVX512-FCP-NEXT: vextractf32x4 $2, %zmm0, 32(%rcx)
339 ; AVX512-FCP-NEXT: vmovaps %ymm0, (%rcx)
340 ; AVX512-FCP-NEXT: vzeroupper
341 ; AVX512-FCP-NEXT: retq
343 ; AVX512DQ-LABEL: store_i32_stride3_vf4:
345 ; AVX512DQ-NEXT: vmovaps (%rdi), %xmm0
346 ; AVX512DQ-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0
347 ; AVX512DQ-NEXT: vinsertf32x4 $2, (%rdx), %zmm0, %zmm0
348 ; AVX512DQ-NEXT: vmovaps {{.*#+}} zmm1 = [0,4,8,1,5,9,2,6,10,3,7,11,u,u,u,u]
349 ; AVX512DQ-NEXT: vpermps %zmm0, %zmm1, %zmm0
350 ; AVX512DQ-NEXT: vextractf32x4 $2, %zmm0, 32(%rcx)
351 ; AVX512DQ-NEXT: vmovaps %ymm0, (%rcx)
352 ; AVX512DQ-NEXT: vzeroupper
353 ; AVX512DQ-NEXT: retq
355 ; AVX512DQ-FCP-LABEL: store_i32_stride3_vf4:
356 ; AVX512DQ-FCP: # %bb.0:
357 ; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %xmm0
358 ; AVX512DQ-FCP-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0
359 ; AVX512DQ-FCP-NEXT: vinsertf32x4 $2, (%rdx), %zmm0, %zmm0
360 ; AVX512DQ-FCP-NEXT: vmovaps {{.*#+}} zmm1 = [0,4,8,1,5,9,2,6,10,3,7,11,u,u,u,u]
361 ; AVX512DQ-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm0
362 ; AVX512DQ-FCP-NEXT: vextractf32x4 $2, %zmm0, 32(%rcx)
363 ; AVX512DQ-FCP-NEXT: vmovaps %ymm0, (%rcx)
364 ; AVX512DQ-FCP-NEXT: vzeroupper
365 ; AVX512DQ-FCP-NEXT: retq
367 ; AVX512BW-LABEL: store_i32_stride3_vf4:
369 ; AVX512BW-NEXT: vmovaps (%rdi), %xmm0
370 ; AVX512BW-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0
371 ; AVX512BW-NEXT: vinsertf32x4 $2, (%rdx), %zmm0, %zmm0
372 ; AVX512BW-NEXT: vmovaps {{.*#+}} zmm1 = [0,4,8,1,5,9,2,6,10,3,7,11,u,u,u,u]
373 ; AVX512BW-NEXT: vpermps %zmm0, %zmm1, %zmm0
374 ; AVX512BW-NEXT: vextractf32x4 $2, %zmm0, 32(%rcx)
375 ; AVX512BW-NEXT: vmovaps %ymm0, (%rcx)
376 ; AVX512BW-NEXT: vzeroupper
377 ; AVX512BW-NEXT: retq
379 ; AVX512BW-FCP-LABEL: store_i32_stride3_vf4:
380 ; AVX512BW-FCP: # %bb.0:
381 ; AVX512BW-FCP-NEXT: vmovaps (%rdi), %xmm0
382 ; AVX512BW-FCP-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0
383 ; AVX512BW-FCP-NEXT: vinsertf32x4 $2, (%rdx), %zmm0, %zmm0
384 ; AVX512BW-FCP-NEXT: vmovaps {{.*#+}} zmm1 = [0,4,8,1,5,9,2,6,10,3,7,11,u,u,u,u]
385 ; AVX512BW-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm0
386 ; AVX512BW-FCP-NEXT: vextractf32x4 $2, %zmm0, 32(%rcx)
387 ; AVX512BW-FCP-NEXT: vmovaps %ymm0, (%rcx)
388 ; AVX512BW-FCP-NEXT: vzeroupper
389 ; AVX512BW-FCP-NEXT: retq
391 ; AVX512DQ-BW-LABEL: store_i32_stride3_vf4:
392 ; AVX512DQ-BW: # %bb.0:
393 ; AVX512DQ-BW-NEXT: vmovaps (%rdi), %xmm0
394 ; AVX512DQ-BW-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0
395 ; AVX512DQ-BW-NEXT: vinsertf32x4 $2, (%rdx), %zmm0, %zmm0
396 ; AVX512DQ-BW-NEXT: vmovaps {{.*#+}} zmm1 = [0,4,8,1,5,9,2,6,10,3,7,11,u,u,u,u]
397 ; AVX512DQ-BW-NEXT: vpermps %zmm0, %zmm1, %zmm0
398 ; AVX512DQ-BW-NEXT: vextractf32x4 $2, %zmm0, 32(%rcx)
399 ; AVX512DQ-BW-NEXT: vmovaps %ymm0, (%rcx)
400 ; AVX512DQ-BW-NEXT: vzeroupper
401 ; AVX512DQ-BW-NEXT: retq
403 ; AVX512DQ-BW-FCP-LABEL: store_i32_stride3_vf4:
404 ; AVX512DQ-BW-FCP: # %bb.0:
405 ; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %xmm0
406 ; AVX512DQ-BW-FCP-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0
407 ; AVX512DQ-BW-FCP-NEXT: vinsertf32x4 $2, (%rdx), %zmm0, %zmm0
408 ; AVX512DQ-BW-FCP-NEXT: vmovaps {{.*#+}} zmm1 = [0,4,8,1,5,9,2,6,10,3,7,11,u,u,u,u]
409 ; AVX512DQ-BW-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm0
410 ; AVX512DQ-BW-FCP-NEXT: vextractf32x4 $2, %zmm0, 32(%rcx)
411 ; AVX512DQ-BW-FCP-NEXT: vmovaps %ymm0, (%rcx)
412 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
413 ; AVX512DQ-BW-FCP-NEXT: retq
414 %in.vec0 = load <4 x i32>, ptr %in.vecptr0, align 64
415 %in.vec1 = load <4 x i32>, ptr %in.vecptr1, align 64
416 %in.vec2 = load <4 x i32>, ptr %in.vecptr2, align 64
417 %1 = shufflevector <4 x i32> %in.vec0, <4 x i32> %in.vec1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
418 %2 = shufflevector <4 x i32> %in.vec2, <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
419 %3 = shufflevector <8 x i32> %1, <8 x i32> %2, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
420 %interleaved.vec = shufflevector <12 x i32> %3, <12 x i32> poison, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
421 store <12 x i32> %interleaved.vec, ptr %out.vec, align 64
425 define void @store_i32_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %out.vec) nounwind {
426 ; SSE-LABEL: store_i32_stride3_vf8:
428 ; SSE-NEXT: movaps (%rdi), %xmm1
429 ; SSE-NEXT: movaps 16(%rdi), %xmm0
430 ; SSE-NEXT: movaps (%rsi), %xmm5
431 ; SSE-NEXT: movaps 16(%rsi), %xmm6
432 ; SSE-NEXT: movaps (%rdx), %xmm2
433 ; SSE-NEXT: movaps 16(%rdx), %xmm3
434 ; SSE-NEXT: movaps %xmm0, %xmm7
435 ; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm6[1]
436 ; SSE-NEXT: movaps %xmm0, %xmm8
437 ; SSE-NEXT: movaps %xmm0, %xmm4
438 ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1]
439 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm6[3,3]
440 ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1],xmm3[1,1]
441 ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm7[0,2]
442 ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm3[0,3]
443 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm8[2,0]
444 ; SSE-NEXT: movaps %xmm1, %xmm7
445 ; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm5[1]
446 ; SSE-NEXT: movaps %xmm1, %xmm8
447 ; SSE-NEXT: movaps %xmm1, %xmm9
448 ; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1]
449 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm5[3,3]
450 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[1,1]
451 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm7[0,2]
452 ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm2[0,3]
453 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm8[2,0]
454 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm3[2,3]
455 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3]
456 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm2[2,3]
457 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3]
458 ; SSE-NEXT: movaps %xmm9, (%rcx)
459 ; SSE-NEXT: movaps %xmm5, 16(%rcx)
460 ; SSE-NEXT: movaps %xmm4, 48(%rcx)
461 ; SSE-NEXT: movaps %xmm6, 64(%rcx)
462 ; SSE-NEXT: movaps %xmm1, 32(%rcx)
463 ; SSE-NEXT: movaps %xmm0, 80(%rcx)
466 ; AVX-LABEL: store_i32_stride3_vf8:
468 ; AVX-NEXT: vmovapd (%rdx), %ymm0
469 ; AVX-NEXT: vmovaps (%rsi), %xmm1
470 ; AVX-NEXT: vmovaps 16(%rsi), %xmm2
471 ; AVX-NEXT: vmovaps (%rdi), %xmm3
472 ; AVX-NEXT: vmovaps 16(%rdi), %xmm4
473 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm3[1],xmm1[1]
474 ; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm1[1,1],xmm5[0,2]
475 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0]
476 ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[2,1]
477 ; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1
478 ; AVX-NEXT: vbroadcastsd (%rdx), %ymm3
479 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7]
480 ; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm4[3,3],xmm2[3,3]
481 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1]
482 ; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,1],xmm4[0,2]
483 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
484 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3]
485 ; AVX-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0,0,3,3]
486 ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7]
487 ; AVX-NEXT: vpermilps {{.*#+}} ymm3 = mem[0,0,3,3,4,4,7,7]
488 ; AVX-NEXT: vpermilpd {{.*#+}} ymm4 = mem[1,0,2,2]
489 ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
490 ; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,1,2,2]
491 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1,2],ymm0[3],ymm3[4,5],ymm0[6],ymm3[7]
492 ; AVX-NEXT: vmovaps %ymm0, 32(%rcx)
493 ; AVX-NEXT: vmovaps %ymm2, 64(%rcx)
494 ; AVX-NEXT: vmovaps %ymm1, (%rcx)
495 ; AVX-NEXT: vzeroupper
498 ; AVX2-LABEL: store_i32_stride3_vf8:
500 ; AVX2-NEXT: vmovaps (%rdi), %ymm0
501 ; AVX2-NEXT: vmovaps (%rsi), %ymm1
502 ; AVX2-NEXT: vmovaps (%rdx), %ymm2
503 ; AVX2-NEXT: vpermilps {{.*#+}} xmm3 = mem[1,0,2,2]
504 ; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,1]
505 ; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[0,0,2,1]
506 ; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7]
507 ; AVX2-NEXT: vbroadcastsd (%rdx), %ymm4
508 ; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
509 ; AVX2-NEXT: vbroadcastsd 24(%rdi), %ymm4
510 ; AVX2-NEXT: vshufps {{.*#+}} ymm5 = ymm1[1,2,3,3,5,6,7,7]
511 ; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,2,2,3]
512 ; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7]
513 ; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm2[2,1,3,3]
514 ; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6],ymm5[7]
515 ; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7]
516 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2]
517 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
518 ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm2[1,1,2,2]
519 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7]
520 ; AVX2-NEXT: vmovaps %ymm0, 32(%rcx)
521 ; AVX2-NEXT: vmovaps %ymm4, 64(%rcx)
522 ; AVX2-NEXT: vmovaps %ymm3, (%rcx)
523 ; AVX2-NEXT: vzeroupper
526 ; AVX2-FP-LABEL: store_i32_stride3_vf8:
528 ; AVX2-FP-NEXT: vmovaps (%rdi), %ymm0
529 ; AVX2-FP-NEXT: vmovaps (%rsi), %ymm1
530 ; AVX2-FP-NEXT: vmovaps (%rdx), %ymm2
531 ; AVX2-FP-NEXT: vpermilps {{.*#+}} xmm3 = mem[1,0,2,2]
532 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,1]
533 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[0,0,2,1]
534 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7]
535 ; AVX2-FP-NEXT: vbroadcastsd (%rdx), %ymm4
536 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
537 ; AVX2-FP-NEXT: vbroadcastsd 24(%rdi), %ymm4
538 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm5 = ymm1[1,2,3,3,5,6,7,7]
539 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,2,2,3]
540 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7]
541 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm5 = ymm2[2,1,3,3]
542 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6],ymm5[7]
543 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7]
544 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2]
545 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
546 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm2[1,1,2,2]
547 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7]
548 ; AVX2-FP-NEXT: vmovaps %ymm0, 32(%rcx)
549 ; AVX2-FP-NEXT: vmovaps %ymm4, 64(%rcx)
550 ; AVX2-FP-NEXT: vmovaps %ymm3, (%rcx)
551 ; AVX2-FP-NEXT: vzeroupper
554 ; AVX2-FCP-LABEL: store_i32_stride3_vf8:
556 ; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm0
557 ; AVX2-FCP-NEXT: vmovaps (%rsi), %ymm1
558 ; AVX2-FCP-NEXT: vmovaps (%rdx), %ymm2
559 ; AVX2-FCP-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [1,0,2,2,1,0,2,2]
560 ; AVX2-FCP-NEXT: # ymm3 = mem[0,1,0,1]
561 ; AVX2-FCP-NEXT: vpermps %ymm1, %ymm3, %ymm3
562 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[0,0,2,1]
563 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7]
564 ; AVX2-FCP-NEXT: vbroadcastsd (%rdx), %ymm4
565 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
566 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm4 = ymm1[0,0,3,3,4,4,7,7]
567 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2]
568 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2],ymm0[3,4],ymm4[5],ymm0[6,7]
569 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm4 = ymm2[1,1,2,2]
570 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm0[1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7]
571 ; AVX2-FCP-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [5,0,7,6,5,0,7,6]
572 ; AVX2-FCP-NEXT: # ymm4 = mem[0,1,0,1]
573 ; AVX2-FCP-NEXT: vpermps %ymm1, %ymm4, %ymm1
574 ; AVX2-FCP-NEXT: vbroadcastsd 24(%rdi), %ymm4
575 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7]
576 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,3,3]
577 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7]
578 ; AVX2-FCP-NEXT: vmovaps %ymm1, 64(%rcx)
579 ; AVX2-FCP-NEXT: vmovaps %ymm0, 32(%rcx)
580 ; AVX2-FCP-NEXT: vmovaps %ymm3, (%rcx)
581 ; AVX2-FCP-NEXT: vzeroupper
582 ; AVX2-FCP-NEXT: retq
584 ; AVX512-LABEL: store_i32_stride3_vf8:
586 ; AVX512-NEXT: vmovdqa (%rdi), %ymm0
587 ; AVX512-NEXT: vmovdqa (%rdx), %ymm1
588 ; AVX512-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0
589 ; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm2 = [13,21,6,14,22,7,15,23]
590 ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
591 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,8,16,1,9,17,2,10,18,3,11,19,4,12,20,5]
592 ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
593 ; AVX512-NEXT: vmovdqa64 %zmm3, (%rcx)
594 ; AVX512-NEXT: vmovdqa %ymm2, 64(%rcx)
595 ; AVX512-NEXT: vzeroupper
598 ; AVX512-FCP-LABEL: store_i32_stride3_vf8:
599 ; AVX512-FCP: # %bb.0:
600 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm0
601 ; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm1
602 ; AVX512-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0
603 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [13,21,6,14,22,7,15,23]
604 ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
605 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,8,16,1,9,17,2,10,18,3,11,19,4,12,20,5]
606 ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
607 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, (%rcx)
608 ; AVX512-FCP-NEXT: vmovdqa %ymm2, 64(%rcx)
609 ; AVX512-FCP-NEXT: vzeroupper
610 ; AVX512-FCP-NEXT: retq
612 ; AVX512DQ-LABEL: store_i32_stride3_vf8:
614 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
615 ; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm1
616 ; AVX512DQ-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0
617 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm2 = [13,21,6,14,22,7,15,23]
618 ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
619 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,8,16,1,9,17,2,10,18,3,11,19,4,12,20,5]
620 ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
621 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, (%rcx)
622 ; AVX512DQ-NEXT: vmovdqa %ymm2, 64(%rcx)
623 ; AVX512DQ-NEXT: vzeroupper
624 ; AVX512DQ-NEXT: retq
626 ; AVX512DQ-FCP-LABEL: store_i32_stride3_vf8:
627 ; AVX512DQ-FCP: # %bb.0:
628 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm0
629 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm1
630 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0
631 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [13,21,6,14,22,7,15,23]
632 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
633 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,8,16,1,9,17,2,10,18,3,11,19,4,12,20,5]
634 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
635 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, (%rcx)
636 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, 64(%rcx)
637 ; AVX512DQ-FCP-NEXT: vzeroupper
638 ; AVX512DQ-FCP-NEXT: retq
640 ; AVX512BW-LABEL: store_i32_stride3_vf8:
642 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
643 ; AVX512BW-NEXT: vmovdqa (%rdx), %ymm1
644 ; AVX512BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0
645 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm2 = [13,21,6,14,22,7,15,23]
646 ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
647 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,8,16,1,9,17,2,10,18,3,11,19,4,12,20,5]
648 ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
649 ; AVX512BW-NEXT: vmovdqa64 %zmm3, (%rcx)
650 ; AVX512BW-NEXT: vmovdqa %ymm2, 64(%rcx)
651 ; AVX512BW-NEXT: vzeroupper
652 ; AVX512BW-NEXT: retq
654 ; AVX512BW-FCP-LABEL: store_i32_stride3_vf8:
655 ; AVX512BW-FCP: # %bb.0:
656 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm0
657 ; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %ymm1
658 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0
659 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [13,21,6,14,22,7,15,23]
660 ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
661 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,8,16,1,9,17,2,10,18,3,11,19,4,12,20,5]
662 ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
663 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%rcx)
664 ; AVX512BW-FCP-NEXT: vmovdqa %ymm2, 64(%rcx)
665 ; AVX512BW-FCP-NEXT: vzeroupper
666 ; AVX512BW-FCP-NEXT: retq
668 ; AVX512DQ-BW-LABEL: store_i32_stride3_vf8:
669 ; AVX512DQ-BW: # %bb.0:
670 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm0
671 ; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %ymm1
672 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0
673 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm2 = [13,21,6,14,22,7,15,23]
674 ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
675 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,8,16,1,9,17,2,10,18,3,11,19,4,12,20,5]
676 ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
677 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, (%rcx)
678 ; AVX512DQ-BW-NEXT: vmovdqa %ymm2, 64(%rcx)
679 ; AVX512DQ-BW-NEXT: vzeroupper
680 ; AVX512DQ-BW-NEXT: retq
682 ; AVX512DQ-BW-FCP-LABEL: store_i32_stride3_vf8:
683 ; AVX512DQ-BW-FCP: # %bb.0:
684 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm0
685 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %ymm1
686 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0
687 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [13,21,6,14,22,7,15,23]
688 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
689 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,8,16,1,9,17,2,10,18,3,11,19,4,12,20,5]
690 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
691 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, (%rcx)
692 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm2, 64(%rcx)
693 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
694 ; AVX512DQ-BW-FCP-NEXT: retq
695 %in.vec0 = load <8 x i32>, ptr %in.vecptr0, align 64
696 %in.vec1 = load <8 x i32>, ptr %in.vecptr1, align 64
697 %in.vec2 = load <8 x i32>, ptr %in.vecptr2, align 64
698 %1 = shufflevector <8 x i32> %in.vec0, <8 x i32> %in.vec1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
699 %2 = shufflevector <8 x i32> %in.vec2, <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
700 %3 = shufflevector <16 x i32> %1, <16 x i32> %2, <24 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
701 %interleaved.vec = shufflevector <24 x i32> %3, <24 x i32> poison, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
702 store <24 x i32> %interleaved.vec, ptr %out.vec, align 64
706 define void @store_i32_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %out.vec) nounwind {
707 ; SSE-LABEL: store_i32_stride3_vf16:
709 ; SSE-NEXT: movaps (%rdi), %xmm1
710 ; SSE-NEXT: movaps 16(%rdi), %xmm2
711 ; SSE-NEXT: movaps 32(%rdi), %xmm4
712 ; SSE-NEXT: movaps 48(%rdi), %xmm5
713 ; SSE-NEXT: movaps (%rsi), %xmm7
714 ; SSE-NEXT: movaps 16(%rsi), %xmm9
715 ; SSE-NEXT: movaps 32(%rsi), %xmm10
716 ; SSE-NEXT: movaps 48(%rsi), %xmm11
717 ; SSE-NEXT: movaps 16(%rdx), %xmm0
718 ; SSE-NEXT: movaps 32(%rdx), %xmm3
719 ; SSE-NEXT: movaps 48(%rdx), %xmm8
720 ; SSE-NEXT: movaps %xmm5, %xmm12
721 ; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm11[1]
722 ; SSE-NEXT: movaps %xmm5, %xmm13
723 ; SSE-NEXT: movaps %xmm5, %xmm6
724 ; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1]
725 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm11[3,3]
726 ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
727 ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,1],xmm8[1,1]
728 ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm12[0,2]
729 ; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,1],xmm8[0,3]
730 ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm13[2,0]
731 ; SSE-NEXT: movaps %xmm4, %xmm13
732 ; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm10[1]
733 ; SSE-NEXT: movaps %xmm4, %xmm14
734 ; SSE-NEXT: movaps %xmm4, %xmm12
735 ; SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1]
736 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3],xmm10[3,3]
737 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
738 ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,1],xmm3[1,1]
739 ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm13[0,2]
740 ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,1],xmm3[0,3]
741 ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm14[2,0]
742 ; SSE-NEXT: movaps %xmm2, %xmm14
743 ; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm9[1]
744 ; SSE-NEXT: movaps %xmm2, %xmm15
745 ; SSE-NEXT: movaps %xmm2, %xmm13
746 ; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm9[0],xmm13[1],xmm9[1]
747 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm9[3,3]
748 ; SSE-NEXT: movaps %xmm0, %xmm8
749 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,1],xmm0[1,1]
750 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm14[0,2]
751 ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,1],xmm0[0,3]
752 ; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm15[2,0]
753 ; SSE-NEXT: movaps %xmm1, %xmm14
754 ; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm7[1]
755 ; SSE-NEXT: movaps %xmm1, %xmm15
756 ; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm7[0],xmm15[1],xmm7[1]
757 ; SSE-NEXT: movaps %xmm1, %xmm3
758 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm7[3,3]
759 ; SSE-NEXT: movaps (%rdx), %xmm0
760 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,1],xmm0[1,1]
761 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm14[0,2]
762 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[0,3]
763 ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm3[2,0]
764 ; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
765 ; SSE-NEXT: # xmm5 = xmm5[1,2],mem[2,3]
766 ; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
767 ; SSE-NEXT: # xmm4 = xmm4[1,2],mem[2,3]
768 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,2],xmm8[2,3]
769 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm0[2,3]
770 ; SSE-NEXT: movaps %xmm15, (%rcx)
771 ; SSE-NEXT: movaps %xmm7, 16(%rcx)
772 ; SSE-NEXT: movaps %xmm13, 48(%rcx)
773 ; SSE-NEXT: movaps %xmm9, 64(%rcx)
774 ; SSE-NEXT: movaps %xmm12, 96(%rcx)
775 ; SSE-NEXT: movaps %xmm10, 112(%rcx)
776 ; SSE-NEXT: movaps %xmm6, 144(%rcx)
777 ; SSE-NEXT: movaps %xmm11, 160(%rcx)
778 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3]
779 ; SSE-NEXT: movaps %xmm1, 32(%rcx)
780 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0,1,3]
781 ; SSE-NEXT: movaps %xmm2, 80(%rcx)
782 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0,1,3]
783 ; SSE-NEXT: movaps %xmm4, 128(%rcx)
784 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0,1,3]
785 ; SSE-NEXT: movaps %xmm5, 176(%rcx)
788 ; AVX-LABEL: store_i32_stride3_vf16:
790 ; AVX-NEXT: vmovapd (%rdx), %ymm1
791 ; AVX-NEXT: vmovapd 32(%rdx), %ymm0
792 ; AVX-NEXT: vmovaps (%rsi), %xmm2
793 ; AVX-NEXT: vmovaps 16(%rsi), %xmm3
794 ; AVX-NEXT: vmovaps 32(%rsi), %xmm4
795 ; AVX-NEXT: vmovaps 48(%rsi), %xmm5
796 ; AVX-NEXT: vmovaps (%rdi), %xmm6
797 ; AVX-NEXT: vmovaps 16(%rdi), %xmm7
798 ; AVX-NEXT: vmovaps 32(%rdi), %xmm8
799 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm6[1],xmm2[1]
800 ; AVX-NEXT: vshufps {{.*#+}} xmm9 = xmm2[1,1],xmm9[0,2]
801 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm6[0]
802 ; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,0],xmm6[2,1]
803 ; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm2, %ymm2
804 ; AVX-NEXT: vbroadcastsd (%rdx), %ymm6
805 ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm6[2],ymm2[3,4],ymm6[5],ymm2[6,7]
806 ; AVX-NEXT: vmovaps 48(%rdi), %xmm6
807 ; AVX-NEXT: vshufps {{.*#+}} xmm9 = xmm6[3,3],xmm5[3,3]
808 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm6[1],xmm5[1]
809 ; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm5[1,1],xmm6[0,2]
810 ; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm5, %ymm5
811 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm0[2,3,2,3]
812 ; AVX-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0,0,3,3]
813 ; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5,6],ymm6[7]
814 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm8[1],xmm4[1]
815 ; AVX-NEXT: vshufps {{.*#+}} xmm6 = xmm4[1,1],xmm6[0,2]
816 ; AVX-NEXT: vmovlhps {{.*#+}} xmm4 = xmm4[0],xmm8[0]
817 ; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,0],xmm8[2,1]
818 ; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4
819 ; AVX-NEXT: vbroadcastsd 32(%rdx), %ymm6
820 ; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm6[2],ymm4[3,4],ymm6[5],ymm4[6,7]
821 ; AVX-NEXT: vshufps {{.*#+}} xmm6 = xmm7[3,3],xmm3[3,3]
822 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm7[1],xmm3[1]
823 ; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,1],xmm7[0,2]
824 ; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3
825 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm1[2,3,2,3]
826 ; AVX-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0,0,3,3]
827 ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm6[1],ymm3[2,3],ymm6[4],ymm3[5,6],ymm6[7]
828 ; AVX-NEXT: vpermilps {{.*#+}} ymm6 = mem[0,0,3,3,4,4,7,7]
829 ; AVX-NEXT: vpermilpd {{.*#+}} ymm7 = mem[1,0,2,2]
830 ; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7]
831 ; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1,1,2,2]
832 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1,2],ymm1[3],ymm6[4,5],ymm1[6],ymm6[7]
833 ; AVX-NEXT: vpermilps {{.*#+}} ymm6 = mem[0,0,3,3,4,4,7,7]
834 ; AVX-NEXT: vpermilpd {{.*#+}} ymm7 = mem[1,0,2,2]
835 ; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7]
836 ; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,1,2,2]
837 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm6[1,2],ymm0[3],ymm6[4,5],ymm0[6],ymm6[7]
838 ; AVX-NEXT: vmovaps %ymm0, 128(%rcx)
839 ; AVX-NEXT: vmovaps %ymm1, 32(%rcx)
840 ; AVX-NEXT: vmovaps %ymm3, 64(%rcx)
841 ; AVX-NEXT: vmovaps %ymm4, 96(%rcx)
842 ; AVX-NEXT: vmovaps %ymm5, 160(%rcx)
843 ; AVX-NEXT: vmovaps %ymm2, (%rcx)
844 ; AVX-NEXT: vzeroupper
847 ; AVX2-LABEL: store_i32_stride3_vf16:
849 ; AVX2-NEXT: vmovaps (%rdi), %ymm4
850 ; AVX2-NEXT: vmovaps 32(%rdi), %ymm0
851 ; AVX2-NEXT: vmovaps (%rsi), %ymm5
852 ; AVX2-NEXT: vmovaps 32(%rsi), %ymm3
853 ; AVX2-NEXT: vmovaps (%rdx), %ymm6
854 ; AVX2-NEXT: vmovaps 32(%rdx), %ymm1
855 ; AVX2-NEXT: vpermilps {{.*#+}} xmm2 = mem[1,0,2,2]
856 ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,1]
857 ; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm4[0,0,2,1]
858 ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0],ymm2[1],ymm7[2,3],ymm2[4],ymm7[5,6],ymm2[7]
859 ; AVX2-NEXT: vbroadcastsd (%rdx), %ymm7
860 ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm7[2],ymm2[3,4],ymm7[5],ymm2[6,7]
861 ; AVX2-NEXT: vbroadcastsd 56(%rdi), %ymm7
862 ; AVX2-NEXT: vshufps {{.*#+}} ymm8 = ymm3[1,2,3,3,5,6,7,7]
863 ; AVX2-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,2,2,3]
864 ; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7]
865 ; AVX2-NEXT: vpermpd {{.*#+}} ymm8 = ymm1[2,1,3,3]
866 ; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3],ymm8[4],ymm7[5,6],ymm8[7]
867 ; AVX2-NEXT: vpermilps {{.*#+}} xmm8 = mem[1,0,2,2]
868 ; AVX2-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,1]
869 ; AVX2-NEXT: vpermpd {{.*#+}} ymm9 = ymm0[0,0,2,1]
870 ; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5,6],ymm8[7]
871 ; AVX2-NEXT: vbroadcastsd 32(%rdx), %ymm9
872 ; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7]
873 ; AVX2-NEXT: vbroadcastsd 24(%rdi), %ymm9
874 ; AVX2-NEXT: vshufps {{.*#+}} ymm10 = ymm5[1,2,3,3,5,6,7,7]
875 ; AVX2-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,2,2,3]
876 ; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7]
877 ; AVX2-NEXT: vpermpd {{.*#+}} ymm10 = ymm6[2,1,3,3]
878 ; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm10[1],ymm9[2,3],ymm10[4],ymm9[5,6],ymm10[7]
879 ; AVX2-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,0,3,3,4,4,7,7]
880 ; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[1,1,2,2]
881 ; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7]
882 ; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm6[1,1,2,2]
883 ; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7]
884 ; AVX2-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,0,3,3,4,4,7,7]
885 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2]
886 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7]
887 ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[1,1,2,2]
888 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7]
889 ; AVX2-NEXT: vmovaps %ymm0, 128(%rcx)
890 ; AVX2-NEXT: vmovaps %ymm4, 32(%rcx)
891 ; AVX2-NEXT: vmovaps %ymm9, 64(%rcx)
892 ; AVX2-NEXT: vmovaps %ymm8, 96(%rcx)
893 ; AVX2-NEXT: vmovaps %ymm7, 160(%rcx)
894 ; AVX2-NEXT: vmovaps %ymm2, (%rcx)
895 ; AVX2-NEXT: vzeroupper
898 ; AVX2-FP-LABEL: store_i32_stride3_vf16:
900 ; AVX2-FP-NEXT: vmovaps (%rdi), %ymm4
901 ; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm0
902 ; AVX2-FP-NEXT: vmovaps (%rsi), %ymm5
903 ; AVX2-FP-NEXT: vmovaps 32(%rsi), %ymm3
904 ; AVX2-FP-NEXT: vmovaps (%rdx), %ymm6
905 ; AVX2-FP-NEXT: vmovaps 32(%rdx), %ymm1
906 ; AVX2-FP-NEXT: vpermilps {{.*#+}} xmm2 = mem[1,0,2,2]
907 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,1]
908 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm4[0,0,2,1]
909 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0],ymm2[1],ymm7[2,3],ymm2[4],ymm7[5,6],ymm2[7]
910 ; AVX2-FP-NEXT: vbroadcastsd (%rdx), %ymm7
911 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm7[2],ymm2[3,4],ymm7[5],ymm2[6,7]
912 ; AVX2-FP-NEXT: vbroadcastsd 56(%rdi), %ymm7
913 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm8 = ymm3[1,2,3,3,5,6,7,7]
914 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,2,2,3]
915 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7]
916 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm8 = ymm1[2,1,3,3]
917 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3],ymm8[4],ymm7[5,6],ymm8[7]
918 ; AVX2-FP-NEXT: vpermilps {{.*#+}} xmm8 = mem[1,0,2,2]
919 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,1]
920 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm9 = ymm0[0,0,2,1]
921 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5,6],ymm8[7]
922 ; AVX2-FP-NEXT: vbroadcastsd 32(%rdx), %ymm9
923 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7]
924 ; AVX2-FP-NEXT: vbroadcastsd 24(%rdi), %ymm9
925 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm10 = ymm5[1,2,3,3,5,6,7,7]
926 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,2,2,3]
927 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7]
928 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm10 = ymm6[2,1,3,3]
929 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm10[1],ymm9[2,3],ymm10[4],ymm9[5,6],ymm10[7]
930 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,0,3,3,4,4,7,7]
931 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[1,1,2,2]
932 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7]
933 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm5 = ymm6[1,1,2,2]
934 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7]
935 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,0,3,3,4,4,7,7]
936 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2]
937 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7]
938 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[1,1,2,2]
939 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7]
940 ; AVX2-FP-NEXT: vmovaps %ymm0, 128(%rcx)
941 ; AVX2-FP-NEXT: vmovaps %ymm4, 32(%rcx)
942 ; AVX2-FP-NEXT: vmovaps %ymm9, 64(%rcx)
943 ; AVX2-FP-NEXT: vmovaps %ymm8, 96(%rcx)
944 ; AVX2-FP-NEXT: vmovaps %ymm7, 160(%rcx)
945 ; AVX2-FP-NEXT: vmovaps %ymm2, (%rcx)
946 ; AVX2-FP-NEXT: vzeroupper
949 ; AVX2-FCP-LABEL: store_i32_stride3_vf16:
951 ; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm1
952 ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm2
953 ; AVX2-FCP-NEXT: vmovaps (%rsi), %ymm3
954 ; AVX2-FCP-NEXT: vmovaps 32(%rsi), %ymm4
955 ; AVX2-FCP-NEXT: vmovaps (%rdx), %ymm5
956 ; AVX2-FCP-NEXT: vmovaps 32(%rdx), %ymm6
957 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,0,3,3,4,4,7,7]
958 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm7 = ymm1[1,1,2,2]
959 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1],ymm0[2],ymm7[3,4],ymm0[5],ymm7[6,7]
960 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm7 = ymm5[1,1,2,2]
961 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0],ymm0[1,2],ymm7[3],ymm0[4,5],ymm7[6],ymm0[7]
962 ; AVX2-FCP-NEXT: vbroadcastf128 {{.*#+}} ymm7 = [1,0,2,2,1,0,2,2]
963 ; AVX2-FCP-NEXT: # ymm7 = mem[0,1,0,1]
964 ; AVX2-FCP-NEXT: vpermps %ymm3, %ymm7, %ymm8
965 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1]
966 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm8[1],ymm1[2,3],ymm8[4],ymm1[5,6],ymm8[7]
967 ; AVX2-FCP-NEXT: vbroadcastsd (%rdx), %ymm8
968 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm8[2],ymm1[3,4],ymm8[5],ymm1[6,7]
969 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm8 = ymm4[0,0,3,3,4,4,7,7]
970 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm9 = ymm2[1,1,2,2]
971 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2],ymm9[3,4],ymm8[5],ymm9[6,7]
972 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm9 = ymm6[1,1,2,2]
973 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0],ymm8[1,2],ymm9[3],ymm8[4,5],ymm9[6],ymm8[7]
974 ; AVX2-FCP-NEXT: vmovaps {{.*#+}} ymm9 = [5,6,5,6,5,6,7,7]
975 ; AVX2-FCP-NEXT: vpermps %ymm4, %ymm9, %ymm10
976 ; AVX2-FCP-NEXT: vbroadcastsd 56(%rdi), %ymm11
977 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7]
978 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,1,3,3]
979 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0],ymm6[1],ymm10[2,3],ymm6[4],ymm10[5,6],ymm6[7]
980 ; AVX2-FCP-NEXT: vpermps %ymm4, %ymm7, %ymm4
981 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1]
982 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6],ymm4[7]
983 ; AVX2-FCP-NEXT: vbroadcastsd 32(%rdx), %ymm4
984 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7]
985 ; AVX2-FCP-NEXT: vpermps %ymm3, %ymm9, %ymm3
986 ; AVX2-FCP-NEXT: vbroadcastsd 24(%rdi), %ymm4
987 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
988 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm4 = ymm5[2,1,3,3]
989 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7]
990 ; AVX2-FCP-NEXT: vmovaps %ymm3, 64(%rcx)
991 ; AVX2-FCP-NEXT: vmovaps %ymm2, 96(%rcx)
992 ; AVX2-FCP-NEXT: vmovaps %ymm6, 160(%rcx)
993 ; AVX2-FCP-NEXT: vmovaps %ymm8, 128(%rcx)
994 ; AVX2-FCP-NEXT: vmovaps %ymm1, (%rcx)
995 ; AVX2-FCP-NEXT: vmovaps %ymm0, 32(%rcx)
996 ; AVX2-FCP-NEXT: vzeroupper
997 ; AVX2-FCP-NEXT: retq
999 ; AVX512-LABEL: store_i32_stride3_vf16:
1001 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
1002 ; AVX512-NEXT: vmovdqa64 (%rsi), %zmm1
1003 ; AVX512-NEXT: vmovdqa64 (%rdx), %zmm2
1004 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5]
1005 ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
1006 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15]
1007 ; AVX512-NEXT: vpermi2d %zmm2, %zmm3, %zmm4
1008 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm3 = [5,0,22,6,0,23,7,0,24,8,0,25,9,0,26,10]
1009 ; AVX512-NEXT: vpermi2d %zmm0, %zmm1, %zmm3
1010 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15]
1011 ; AVX512-NEXT: vpermi2d %zmm2, %zmm3, %zmm5
1012 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0]
1013 ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
1014 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31]
1015 ; AVX512-NEXT: vpermi2d %zmm2, %zmm3, %zmm0
1016 ; AVX512-NEXT: vmovdqa64 %zmm0, 128(%rcx)
1017 ; AVX512-NEXT: vmovdqa64 %zmm5, 64(%rcx)
1018 ; AVX512-NEXT: vmovdqa64 %zmm4, (%rcx)
1019 ; AVX512-NEXT: vzeroupper
1022 ; AVX512-FCP-LABEL: store_i32_stride3_vf16:
1023 ; AVX512-FCP: # %bb.0:
1024 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
1025 ; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm1
1026 ; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm2
1027 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5]
1028 ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
1029 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15]
1030 ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm4
1031 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [5,0,22,6,0,23,7,0,24,8,0,25,9,0,26,10]
1032 ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm3
1033 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15]
1034 ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm5
1035 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0]
1036 ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
1037 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31]
1038 ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm0
1039 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 128(%rcx)
1040 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 64(%rcx)
1041 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%rcx)
1042 ; AVX512-FCP-NEXT: vzeroupper
1043 ; AVX512-FCP-NEXT: retq
1045 ; AVX512DQ-LABEL: store_i32_stride3_vf16:
1046 ; AVX512DQ: # %bb.0:
1047 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0
1048 ; AVX512DQ-NEXT: vmovdqa64 (%rsi), %zmm1
1049 ; AVX512DQ-NEXT: vmovdqa64 (%rdx), %zmm2
1050 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5]
1051 ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
1052 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15]
1053 ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm3, %zmm4
1054 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [5,0,22,6,0,23,7,0,24,8,0,25,9,0,26,10]
1055 ; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm1, %zmm3
1056 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15]
1057 ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm3, %zmm5
1058 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0]
1059 ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
1060 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31]
1061 ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm3, %zmm0
1062 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 128(%rcx)
1063 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, 64(%rcx)
1064 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, (%rcx)
1065 ; AVX512DQ-NEXT: vzeroupper
1066 ; AVX512DQ-NEXT: retq
1068 ; AVX512DQ-FCP-LABEL: store_i32_stride3_vf16:
1069 ; AVX512DQ-FCP: # %bb.0:
1070 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
1071 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm1
1072 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm2
1073 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5]
1074 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
1075 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15]
1076 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm4
1077 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [5,0,22,6,0,23,7,0,24,8,0,25,9,0,26,10]
1078 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm3
1079 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15]
1080 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm5
1081 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0]
1082 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
1083 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31]
1084 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm0
1085 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 128(%rcx)
1086 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 64(%rcx)
1087 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, (%rcx)
1088 ; AVX512DQ-FCP-NEXT: vzeroupper
1089 ; AVX512DQ-FCP-NEXT: retq
1091 ; AVX512BW-LABEL: store_i32_stride3_vf16:
1092 ; AVX512BW: # %bb.0:
1093 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
1094 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm1
1095 ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm2
1096 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5]
1097 ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
1098 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15]
1099 ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm4
1100 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [5,0,22,6,0,23,7,0,24,8,0,25,9,0,26,10]
1101 ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm3
1102 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15]
1103 ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm5
1104 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0]
1105 ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
1106 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31]
1107 ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm0
1108 ; AVX512BW-NEXT: vmovdqa64 %zmm0, 128(%rcx)
1109 ; AVX512BW-NEXT: vmovdqa64 %zmm5, 64(%rcx)
1110 ; AVX512BW-NEXT: vmovdqa64 %zmm4, (%rcx)
1111 ; AVX512BW-NEXT: vzeroupper
1112 ; AVX512BW-NEXT: retq
1114 ; AVX512BW-FCP-LABEL: store_i32_stride3_vf16:
1115 ; AVX512BW-FCP: # %bb.0:
1116 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
1117 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm1
1118 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm2
1119 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5]
1120 ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
1121 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15]
1122 ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm4
1123 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [5,0,22,6,0,23,7,0,24,8,0,25,9,0,26,10]
1124 ; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm3
1125 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15]
1126 ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm5
1127 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0]
1128 ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
1129 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31]
1130 ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm0
1131 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 128(%rcx)
1132 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 64(%rcx)
1133 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, (%rcx)
1134 ; AVX512BW-FCP-NEXT: vzeroupper
1135 ; AVX512BW-FCP-NEXT: retq
1137 ; AVX512DQ-BW-LABEL: store_i32_stride3_vf16:
1138 ; AVX512DQ-BW: # %bb.0:
1139 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0
1140 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm1
1141 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm2
1142 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5]
1143 ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
1144 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15]
1145 ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm4
1146 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [5,0,22,6,0,23,7,0,24,8,0,25,9,0,26,10]
1147 ; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm3
1148 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15]
1149 ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm5
1150 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0]
1151 ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
1152 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31]
1153 ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm0
1154 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 128(%rcx)
1155 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 64(%rcx)
1156 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, (%rcx)
1157 ; AVX512DQ-BW-NEXT: vzeroupper
1158 ; AVX512DQ-BW-NEXT: retq
1160 ; AVX512DQ-BW-FCP-LABEL: store_i32_stride3_vf16:
1161 ; AVX512DQ-BW-FCP: # %bb.0:
1162 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
1163 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm1
1164 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm2
1165 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5]
1166 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
1167 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15]
1168 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm4
1169 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [5,0,22,6,0,23,7,0,24,8,0,25,9,0,26,10]
1170 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm3
1171 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15]
1172 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm5
1173 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0]
1174 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
1175 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31]
1176 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm0
1177 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 128(%rcx)
1178 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 64(%rcx)
1179 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, (%rcx)
1180 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
1181 ; AVX512DQ-BW-FCP-NEXT: retq
1182 %in.vec0 = load <16 x i32>, ptr %in.vecptr0, align 64
1183 %in.vec1 = load <16 x i32>, ptr %in.vecptr1, align 64
1184 %in.vec2 = load <16 x i32>, ptr %in.vecptr2, align 64
1185 %1 = shufflevector <16 x i32> %in.vec0, <16 x i32> %in.vec1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
1186 %2 = shufflevector <16 x i32> %in.vec2, <16 x i32> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1187 %3 = shufflevector <32 x i32> %1, <32 x i32> %2, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
1188 %interleaved.vec = shufflevector <48 x i32> %3, <48 x i32> poison, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47>
1189 store <48 x i32> %interleaved.vec, ptr %out.vec, align 64
1193 define void @store_i32_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %out.vec) nounwind {
1194 ; SSE-LABEL: store_i32_stride3_vf32:
1196 ; SSE-NEXT: subq $152, %rsp
1197 ; SSE-NEXT: movaps (%rdi), %xmm1
1198 ; SSE-NEXT: movaps 16(%rdi), %xmm2
1199 ; SSE-NEXT: movaps 32(%rdi), %xmm3
1200 ; SSE-NEXT: movaps 48(%rdi), %xmm4
1201 ; SSE-NEXT: movaps (%rsi), %xmm10
1202 ; SSE-NEXT: movaps 16(%rsi), %xmm13
1203 ; SSE-NEXT: movaps 32(%rsi), %xmm12
1204 ; SSE-NEXT: movaps 48(%rsi), %xmm9
1205 ; SSE-NEXT: movaps (%rdx), %xmm5
1206 ; SSE-NEXT: movaps 16(%rdx), %xmm6
1207 ; SSE-NEXT: movaps 32(%rdx), %xmm7
1208 ; SSE-NEXT: movaps 48(%rdx), %xmm8
1209 ; SSE-NEXT: movaps %xmm1, %xmm0
1210 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm5[0,3]
1211 ; SSE-NEXT: movaps %xmm5, %xmm11
1212 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1213 ; SSE-NEXT: movaps %xmm1, %xmm5
1214 ; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1]
1215 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,0]
1216 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1217 ; SSE-NEXT: movaps %xmm1, %xmm0
1218 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm10[1]
1219 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm10[3,3]
1220 ; SSE-NEXT: movaps %xmm1, %xmm15
1221 ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,1],xmm11[1,1]
1222 ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm0[0,2]
1223 ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1224 ; SSE-NEXT: movaps %xmm2, %xmm0
1225 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm6[0,3]
1226 ; SSE-NEXT: movaps %xmm2, %xmm1
1227 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1]
1228 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
1229 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1230 ; SSE-NEXT: movaps %xmm2, %xmm0
1231 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm13[1]
1232 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm13[3,3]
1233 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1234 ; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,1],xmm6[1,1]
1235 ; SSE-NEXT: movaps %xmm6, %xmm14
1236 ; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,2],xmm0[0,2]
1237 ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1238 ; SSE-NEXT: movaps %xmm3, %xmm0
1239 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1240 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm7[0,3]
1241 ; SSE-NEXT: movaps %xmm3, %xmm2
1242 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1]
1243 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0]
1244 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1245 ; SSE-NEXT: movaps %xmm3, %xmm0
1246 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm12[1]
1247 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm12[3,3]
1248 ; SSE-NEXT: movaps %xmm3, (%rsp) # 16-byte Spill
1249 ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,1],xmm7[1,1]
1250 ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,2],xmm0[0,2]
1251 ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1252 ; SSE-NEXT: movaps %xmm4, %xmm0
1253 ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1254 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm8[0,3]
1255 ; SSE-NEXT: movaps %xmm4, %xmm1
1256 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1]
1257 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
1258 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1259 ; SSE-NEXT: movaps %xmm4, %xmm0
1260 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm9[1]
1261 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3],xmm9[3,3]
1262 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1263 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,1],xmm8[1,1]
1264 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm0[0,2]
1265 ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1266 ; SSE-NEXT: movaps 64(%rdi), %xmm9
1267 ; SSE-NEXT: movaps 64(%rdx), %xmm1
1268 ; SSE-NEXT: movaps %xmm9, %xmm0
1269 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3]
1270 ; SSE-NEXT: movaps %xmm1, %xmm2
1271 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1272 ; SSE-NEXT: movaps 64(%rsi), %xmm12
1273 ; SSE-NEXT: movaps %xmm9, %xmm1
1274 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1]
1275 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
1276 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1277 ; SSE-NEXT: movaps %xmm9, %xmm0
1278 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm12[1]
1279 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[3,3],xmm12[3,3]
1280 ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,1],xmm2[1,1]
1281 ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,2],xmm0[0,2]
1282 ; SSE-NEXT: movaps 80(%rdi), %xmm10
1283 ; SSE-NEXT: movaps 80(%rdx), %xmm1
1284 ; SSE-NEXT: movaps %xmm10, %xmm0
1285 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3]
1286 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1287 ; SSE-NEXT: movaps 80(%rsi), %xmm8
1288 ; SSE-NEXT: movaps %xmm10, %xmm11
1289 ; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm8[0],xmm11[1],xmm8[1]
1290 ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm0[2,0]
1291 ; SSE-NEXT: movaps %xmm10, %xmm0
1292 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm8[1]
1293 ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,3],xmm8[3,3]
1294 ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm1[1,1]
1295 ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm0[0,2]
1296 ; SSE-NEXT: movaps 96(%rdi), %xmm4
1297 ; SSE-NEXT: movaps 96(%rdx), %xmm13
1298 ; SSE-NEXT: movaps %xmm4, %xmm0
1299 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm13[0,3]
1300 ; SSE-NEXT: movaps 96(%rsi), %xmm5
1301 ; SSE-NEXT: movaps %xmm4, %xmm6
1302 ; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
1303 ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm0[2,0]
1304 ; SSE-NEXT: movaps %xmm4, %xmm0
1305 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm5[1]
1306 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3],xmm5[3,3]
1307 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm13[1,1]
1308 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm0[0,2]
1309 ; SSE-NEXT: movaps 112(%rdi), %xmm0
1310 ; SSE-NEXT: movaps 112(%rdx), %xmm7
1311 ; SSE-NEXT: movaps %xmm0, %xmm2
1312 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm7[0,3]
1313 ; SSE-NEXT: movaps 112(%rsi), %xmm1
1314 ; SSE-NEXT: movaps %xmm0, %xmm3
1315 ; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
1316 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0]
1317 ; SSE-NEXT: movaps %xmm0, %xmm2
1318 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
1319 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3]
1320 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm7[1,1]
1321 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
1322 ; SSE-NEXT: movaps %xmm15, %xmm2
1323 ; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
1324 ; SSE-NEXT: # xmm2 = xmm2[1,2],mem[2,3]
1325 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
1326 ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,2],xmm14[2,3]
1327 ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1328 ; SSE-NEXT: movaps (%rsp), %xmm15 # 16-byte Reload
1329 ; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
1330 ; SSE-NEXT: # xmm15 = xmm15[1,2],mem[2,3]
1331 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
1332 ; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
1333 ; SSE-NEXT: # xmm14 = xmm14[1,2],mem[2,3]
1334 ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1335 ; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
1336 ; SSE-NEXT: # xmm9 = xmm9[1,2],mem[2,3]
1337 ; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
1338 ; SSE-NEXT: # xmm10 = xmm10[1,2],mem[2,3]
1339 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,2],xmm13[2,3]
1340 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm7[2,3]
1341 ; SSE-NEXT: movaps %xmm1, 352(%rcx)
1342 ; SSE-NEXT: movaps %xmm3, 336(%rcx)
1343 ; SSE-NEXT: movaps %xmm5, 304(%rcx)
1344 ; SSE-NEXT: movaps %xmm6, 288(%rcx)
1345 ; SSE-NEXT: movaps %xmm8, 256(%rcx)
1346 ; SSE-NEXT: movaps %xmm11, 240(%rcx)
1347 ; SSE-NEXT: movaps %xmm12, 208(%rcx)
1348 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1349 ; SSE-NEXT: movaps %xmm1, 192(%rcx)
1350 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1351 ; SSE-NEXT: movaps %xmm1, 160(%rcx)
1352 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1353 ; SSE-NEXT: movaps %xmm1, 144(%rcx)
1354 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1355 ; SSE-NEXT: movaps %xmm1, 112(%rcx)
1356 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1357 ; SSE-NEXT: movaps %xmm1, 96(%rcx)
1358 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1359 ; SSE-NEXT: movaps %xmm1, 64(%rcx)
1360 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1361 ; SSE-NEXT: movaps %xmm1, 48(%rcx)
1362 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1363 ; SSE-NEXT: movaps %xmm1, 16(%rcx)
1364 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1365 ; SSE-NEXT: movaps %xmm1, (%rcx)
1366 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3]
1367 ; SSE-NEXT: movaps %xmm0, 368(%rcx)
1368 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0,1,3]
1369 ; SSE-NEXT: movaps %xmm4, 320(%rcx)
1370 ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0,1,3]
1371 ; SSE-NEXT: movaps %xmm10, 272(%rcx)
1372 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0,1,3]
1373 ; SSE-NEXT: movaps %xmm9, 224(%rcx)
1374 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1375 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3]
1376 ; SSE-NEXT: movaps %xmm0, 176(%rcx)
1377 ; SSE-NEXT: movaps %xmm15, %xmm0
1378 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm15[1,3]
1379 ; SSE-NEXT: movaps %xmm0, 128(%rcx)
1380 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1381 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3]
1382 ; SSE-NEXT: movaps %xmm0, 80(%rcx)
1383 ; SSE-NEXT: movaps %xmm2, %xmm0
1384 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[1,3]
1385 ; SSE-NEXT: movaps %xmm0, 32(%rcx)
1386 ; SSE-NEXT: addq $152, %rsp
1389 ; AVX-LABEL: store_i32_stride3_vf32:
1391 ; AVX-NEXT: vmovapd (%rdx), %ymm4
1392 ; AVX-NEXT: vmovapd 32(%rdx), %ymm2
1393 ; AVX-NEXT: vmovapd 64(%rdx), %ymm3
1394 ; AVX-NEXT: vmovapd 96(%rdx), %ymm0
1395 ; AVX-NEXT: vmovaps (%rsi), %xmm1
1396 ; AVX-NEXT: vmovaps 16(%rsi), %xmm7
1397 ; AVX-NEXT: vmovaps 32(%rsi), %xmm10
1398 ; AVX-NEXT: vmovaps 48(%rsi), %xmm9
1399 ; AVX-NEXT: vmovaps (%rdi), %xmm5
1400 ; AVX-NEXT: vmovaps 16(%rdi), %xmm8
1401 ; AVX-NEXT: vmovaps 32(%rdi), %xmm11
1402 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm5[1],xmm1[1]
1403 ; AVX-NEXT: vshufps {{.*#+}} xmm6 = xmm1[1,1],xmm6[0,2]
1404 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0]
1405 ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm5[2,1]
1406 ; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1
1407 ; AVX-NEXT: vbroadcastsd (%rdx), %ymm5
1408 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm5[2],ymm1[3,4],ymm5[5],ymm1[6,7]
1409 ; AVX-NEXT: vmovaps 80(%rsi), %xmm5
1410 ; AVX-NEXT: vmovaps 80(%rdi), %xmm6
1411 ; AVX-NEXT: vshufps {{.*#+}} xmm12 = xmm6[3,3],xmm5[3,3]
1412 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm6[1],xmm5[1]
1413 ; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm5[1,1],xmm6[0,2]
1414 ; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm5, %ymm5
1415 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm3[2,3,2,3]
1416 ; AVX-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0,0,3,3]
1417 ; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5,6],ymm6[7]
1418 ; AVX-NEXT: vmovaps 64(%rsi), %xmm6
1419 ; AVX-NEXT: vmovaps 64(%rdi), %xmm12
1420 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm12[1],xmm6[1]
1421 ; AVX-NEXT: vshufps {{.*#+}} xmm13 = xmm6[1,1],xmm13[0,2]
1422 ; AVX-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm12[0]
1423 ; AVX-NEXT: vshufps {{.*#+}} xmm6 = xmm6[2,0],xmm12[2,1]
1424 ; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm6, %ymm6
1425 ; AVX-NEXT: vbroadcastsd 64(%rdx), %ymm12
1426 ; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm12[2],ymm6[3,4],ymm12[5],ymm6[6,7]
1427 ; AVX-NEXT: vmovaps 48(%rdi), %xmm12
1428 ; AVX-NEXT: vshufps {{.*#+}} xmm13 = xmm12[3,3],xmm9[3,3]
1429 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm12 = xmm12[1],xmm9[1]
1430 ; AVX-NEXT: vshufps {{.*#+}} xmm9 = xmm9[1,1],xmm12[0,2]
1431 ; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm9, %ymm9
1432 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm2[2,3,2,3]
1433 ; AVX-NEXT: vshufpd {{.*#+}} ymm12 = ymm12[0,0,3,3]
1434 ; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm12[1],ymm9[2,3],ymm12[4],ymm9[5,6],ymm12[7]
1435 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm12 = xmm11[1],xmm10[1]
1436 ; AVX-NEXT: vshufps {{.*#+}} xmm12 = xmm10[1,1],xmm12[0,2]
1437 ; AVX-NEXT: vmovlhps {{.*#+}} xmm10 = xmm10[0],xmm11[0]
1438 ; AVX-NEXT: vshufps {{.*#+}} xmm10 = xmm10[2,0],xmm11[2,1]
1439 ; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm10, %ymm10
1440 ; AVX-NEXT: vbroadcastsd 32(%rdx), %ymm11
1441 ; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7]
1442 ; AVX-NEXT: vmovaps 112(%rsi), %xmm11
1443 ; AVX-NEXT: vmovaps 112(%rdi), %xmm12
1444 ; AVX-NEXT: vshufps {{.*#+}} xmm13 = xmm12[3,3],xmm11[3,3]
1445 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm12 = xmm12[1],xmm11[1]
1446 ; AVX-NEXT: vshufps {{.*#+}} xmm11 = xmm11[1,1],xmm12[0,2]
1447 ; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm11, %ymm11
1448 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm0[2,3,2,3]
1449 ; AVX-NEXT: vshufpd {{.*#+}} ymm12 = ymm12[0,0,3,3]
1450 ; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5,6],ymm12[7]
1451 ; AVX-NEXT: vmovaps 96(%rsi), %xmm12
1452 ; AVX-NEXT: vmovaps 96(%rdi), %xmm13
1453 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm14 = xmm13[1],xmm12[1]
1454 ; AVX-NEXT: vshufps {{.*#+}} xmm14 = xmm12[1,1],xmm14[0,2]
1455 ; AVX-NEXT: vmovlhps {{.*#+}} xmm12 = xmm12[0],xmm13[0]
1456 ; AVX-NEXT: vshufps {{.*#+}} xmm12 = xmm12[2,0],xmm13[2,1]
1457 ; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm12, %ymm12
1458 ; AVX-NEXT: vbroadcastsd 96(%rdx), %ymm13
1459 ; AVX-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm13[2],ymm12[3,4],ymm13[5],ymm12[6,7]
1460 ; AVX-NEXT: vshufps {{.*#+}} xmm13 = xmm8[3,3],xmm7[3,3]
1461 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm8[1],xmm7[1]
1462 ; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,1],xmm8[0,2]
1463 ; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm7, %ymm7
1464 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm4[2,3,2,3]
1465 ; AVX-NEXT: vshufpd {{.*#+}} ymm8 = ymm8[0,0,3,3]
1466 ; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3],ymm8[4],ymm7[5,6],ymm8[7]
1467 ; AVX-NEXT: vpermilps {{.*#+}} ymm8 = mem[0,0,3,3,4,4,7,7]
1468 ; AVX-NEXT: vpermilpd {{.*#+}} ymm13 = mem[1,0,2,2]
1469 ; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1],ymm8[2],ymm13[3,4],ymm8[5],ymm13[6,7]
1470 ; AVX-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[1,1,2,2]
1471 ; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm8[1,2],ymm4[3],ymm8[4,5],ymm4[6],ymm8[7]
1472 ; AVX-NEXT: vpermilps {{.*#+}} ymm8 = mem[0,0,3,3,4,4,7,7]
1473 ; AVX-NEXT: vpermilpd {{.*#+}} ymm13 = mem[1,0,2,2]
1474 ; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1],ymm8[2],ymm13[3,4],ymm8[5],ymm13[6,7]
1475 ; AVX-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[1,1,2,2]
1476 ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm8[1,2],ymm3[3],ymm8[4,5],ymm3[6],ymm8[7]
1477 ; AVX-NEXT: vpermilps {{.*#+}} ymm8 = mem[0,0,3,3,4,4,7,7]
1478 ; AVX-NEXT: vpermilpd {{.*#+}} ymm13 = mem[1,0,2,2]
1479 ; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1],ymm8[2],ymm13[3,4],ymm8[5],ymm13[6,7]
1480 ; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[1,1,2,2]
1481 ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm8[1,2],ymm2[3],ymm8[4,5],ymm2[6],ymm8[7]
1482 ; AVX-NEXT: vpermilps {{.*#+}} ymm8 = mem[0,0,3,3,4,4,7,7]
1483 ; AVX-NEXT: vpermilpd {{.*#+}} ymm13 = mem[1,0,2,2]
1484 ; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1],ymm8[2],ymm13[3,4],ymm8[5],ymm13[6,7]
1485 ; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,1,2,2]
1486 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm8[1,2],ymm0[3],ymm8[4,5],ymm0[6],ymm8[7]
1487 ; AVX-NEXT: vmovaps %ymm0, 320(%rcx)
1488 ; AVX-NEXT: vmovaps %ymm2, 128(%rcx)
1489 ; AVX-NEXT: vmovaps %ymm3, 224(%rcx)
1490 ; AVX-NEXT: vmovaps %ymm4, 32(%rcx)
1491 ; AVX-NEXT: vmovaps %ymm7, 64(%rcx)
1492 ; AVX-NEXT: vmovaps %ymm12, 288(%rcx)
1493 ; AVX-NEXT: vmovaps %ymm11, 352(%rcx)
1494 ; AVX-NEXT: vmovaps %ymm10, 96(%rcx)
1495 ; AVX-NEXT: vmovaps %ymm9, 160(%rcx)
1496 ; AVX-NEXT: vmovaps %ymm6, 192(%rcx)
1497 ; AVX-NEXT: vmovaps %ymm5, 256(%rcx)
1498 ; AVX-NEXT: vmovaps %ymm1, (%rcx)
1499 ; AVX-NEXT: vzeroupper
1502 ; AVX2-LABEL: store_i32_stride3_vf32:
1504 ; AVX2-NEXT: subq $40, %rsp
1505 ; AVX2-NEXT: vmovaps (%rdi), %ymm8
1506 ; AVX2-NEXT: vmovaps 32(%rdi), %ymm1
1507 ; AVX2-NEXT: vmovaps 64(%rdi), %ymm6
1508 ; AVX2-NEXT: vmovaps 32(%rsi), %ymm5
1509 ; AVX2-NEXT: vmovaps 64(%rsi), %ymm9
1510 ; AVX2-NEXT: vmovaps 96(%rsi), %ymm2
1511 ; AVX2-NEXT: vmovaps 32(%rdx), %ymm7
1512 ; AVX2-NEXT: vmovaps 64(%rdx), %ymm10
1513 ; AVX2-NEXT: vmovaps 96(%rdx), %ymm3
1514 ; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,0,2,2]
1515 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
1516 ; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm8[0,0,2,1]
1517 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2,3],ymm0[4],ymm4[5,6],ymm0[7]
1518 ; AVX2-NEXT: vbroadcastsd (%rdx), %ymm4
1519 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2],ymm0[3,4],ymm4[5],ymm0[6,7]
1520 ; AVX2-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
1521 ; AVX2-NEXT: vbroadcastsd 88(%rdi), %ymm4
1522 ; AVX2-NEXT: vshufps {{.*#+}} ymm11 = ymm9[1,2,3,3,5,6,7,7]
1523 ; AVX2-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,2,2,3]
1524 ; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1],ymm4[2],ymm11[3,4],ymm4[5],ymm11[6,7]
1525 ; AVX2-NEXT: vpermpd {{.*#+}} ymm11 = ymm10[2,1,3,3]
1526 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm11[1],ymm4[2,3],ymm11[4],ymm4[5,6],ymm11[7]
1527 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1528 ; AVX2-NEXT: vpermilps {{.*#+}} xmm11 = mem[1,0,2,2]
1529 ; AVX2-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,0,1]
1530 ; AVX2-NEXT: vpermpd {{.*#+}} ymm12 = ymm6[0,0,2,1]
1531 ; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0],ymm11[1],ymm12[2,3],ymm11[4],ymm12[5,6],ymm11[7]
1532 ; AVX2-NEXT: vbroadcastsd 64(%rdx), %ymm12
1533 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7]
1534 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1535 ; AVX2-NEXT: vbroadcastsd 56(%rdi), %ymm12
1536 ; AVX2-NEXT: vshufps {{.*#+}} ymm13 = ymm5[1,2,3,3,5,6,7,7]
1537 ; AVX2-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,2,2,3]
1538 ; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7]
1539 ; AVX2-NEXT: vpermpd {{.*#+}} ymm13 = ymm7[2,1,3,3]
1540 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0],ymm13[1],ymm12[2,3],ymm13[4],ymm12[5,6],ymm13[7]
1541 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1542 ; AVX2-NEXT: vpermilps {{.*#+}} xmm13 = mem[1,0,2,2]
1543 ; AVX2-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,1,0,1]
1544 ; AVX2-NEXT: vpermpd {{.*#+}} ymm14 = ymm1[0,0,2,1]
1545 ; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0],ymm13[1],ymm14[2,3],ymm13[4],ymm14[5,6],ymm13[7]
1546 ; AVX2-NEXT: vbroadcastsd 32(%rdx), %ymm14
1547 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm14[2],ymm13[3,4],ymm14[5],ymm13[6,7]
1548 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1549 ; AVX2-NEXT: vbroadcastsd 120(%rdi), %ymm14
1550 ; AVX2-NEXT: vshufps {{.*#+}} ymm15 = ymm2[1,2,3,3,5,6,7,7]
1551 ; AVX2-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,3]
1552 ; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2],ymm15[3,4],ymm14[5],ymm15[6,7]
1553 ; AVX2-NEXT: vpermpd {{.*#+}} ymm15 = ymm3[2,1,3,3]
1554 ; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3],ymm15[4],ymm14[5,6],ymm15[7]
1555 ; AVX2-NEXT: vpermilps {{.*#+}} xmm15 = mem[1,0,2,2]
1556 ; AVX2-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,0,1]
1557 ; AVX2-NEXT: vmovaps 96(%rdi), %ymm0
1558 ; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[0,0,2,1]
1559 ; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm15[1],ymm4[2,3],ymm15[4],ymm4[5,6],ymm15[7]
1560 ; AVX2-NEXT: vbroadcastsd 96(%rdx), %ymm15
1561 ; AVX2-NEXT: vblendps {{.*#+}} ymm15 = ymm4[0,1],ymm15[2],ymm4[3,4],ymm15[5],ymm4[6,7]
1562 ; AVX2-NEXT: vmovaps (%rsi), %ymm4
1563 ; AVX2-NEXT: vshufps {{.*#+}} ymm11 = ymm4[1,2,3,3,5,6,7,7]
1564 ; AVX2-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,2,2,3]
1565 ; AVX2-NEXT: vbroadcastsd 24(%rdi), %ymm12
1566 ; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7]
1567 ; AVX2-NEXT: vmovaps (%rdx), %ymm12
1568 ; AVX2-NEXT: vpermpd {{.*#+}} ymm13 = ymm12[2,1,3,3]
1569 ; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm13[1],ymm11[2,3],ymm13[4],ymm11[5,6],ymm13[7]
1570 ; AVX2-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,0,3,3,4,4,7,7]
1571 ; AVX2-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[1,1,2,2]
1572 ; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1],ymm4[2],ymm8[3,4],ymm4[5],ymm8[6,7]
1573 ; AVX2-NEXT: vpermpd {{.*#+}} ymm8 = ymm12[1,1,2,2]
1574 ; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0],ymm4[1,2],ymm8[3],ymm4[4,5],ymm8[6],ymm4[7]
1575 ; AVX2-NEXT: vshufps {{.*#+}} ymm8 = ymm9[0,0,3,3,4,4,7,7]
1576 ; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[1,1,2,2]
1577 ; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm8[2],ymm6[3,4],ymm8[5],ymm6[6,7]
1578 ; AVX2-NEXT: vpermpd {{.*#+}} ymm8 = ymm10[1,1,2,2]
1579 ; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0],ymm6[1,2],ymm8[3],ymm6[4,5],ymm8[6],ymm6[7]
1580 ; AVX2-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,0,3,3,4,4,7,7]
1581 ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[1,1,2,2]
1582 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm5[2],ymm1[3,4],ymm5[5],ymm1[6,7]
1583 ; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm7[1,1,2,2]
1584 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0],ymm1[1,2],ymm5[3],ymm1[4,5],ymm5[6],ymm1[7]
1585 ; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,0,3,3,4,4,7,7]
1586 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2]
1587 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7]
1588 ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm3[1,1,2,2]
1589 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7]
1590 ; AVX2-NEXT: vmovaps %ymm0, 320(%rcx)
1591 ; AVX2-NEXT: vmovaps %ymm1, 128(%rcx)
1592 ; AVX2-NEXT: vmovaps %ymm6, 224(%rcx)
1593 ; AVX2-NEXT: vmovaps %ymm4, 32(%rcx)
1594 ; AVX2-NEXT: vmovaps %ymm11, 64(%rcx)
1595 ; AVX2-NEXT: vmovaps %ymm15, 288(%rcx)
1596 ; AVX2-NEXT: vmovaps %ymm14, 352(%rcx)
1597 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1598 ; AVX2-NEXT: vmovaps %ymm0, 96(%rcx)
1599 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1600 ; AVX2-NEXT: vmovaps %ymm0, 160(%rcx)
1601 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1602 ; AVX2-NEXT: vmovaps %ymm0, 192(%rcx)
1603 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1604 ; AVX2-NEXT: vmovaps %ymm0, 256(%rcx)
1605 ; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
1606 ; AVX2-NEXT: vmovaps %ymm0, (%rcx)
1607 ; AVX2-NEXT: addq $40, %rsp
1608 ; AVX2-NEXT: vzeroupper
1611 ; AVX2-FP-LABEL: store_i32_stride3_vf32:
1613 ; AVX2-FP-NEXT: subq $40, %rsp
1614 ; AVX2-FP-NEXT: vmovaps (%rdi), %ymm8
1615 ; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm1
1616 ; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm6
1617 ; AVX2-FP-NEXT: vmovaps 32(%rsi), %ymm5
1618 ; AVX2-FP-NEXT: vmovaps 64(%rsi), %ymm9
1619 ; AVX2-FP-NEXT: vmovaps 96(%rsi), %ymm2
1620 ; AVX2-FP-NEXT: vmovaps 32(%rdx), %ymm7
1621 ; AVX2-FP-NEXT: vmovaps 64(%rdx), %ymm10
1622 ; AVX2-FP-NEXT: vmovaps 96(%rdx), %ymm3
1623 ; AVX2-FP-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,0,2,2]
1624 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
1625 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm4 = ymm8[0,0,2,1]
1626 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2,3],ymm0[4],ymm4[5,6],ymm0[7]
1627 ; AVX2-FP-NEXT: vbroadcastsd (%rdx), %ymm4
1628 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2],ymm0[3,4],ymm4[5],ymm0[6,7]
1629 ; AVX2-FP-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
1630 ; AVX2-FP-NEXT: vbroadcastsd 88(%rdi), %ymm4
1631 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm11 = ymm9[1,2,3,3,5,6,7,7]
1632 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,2,2,3]
1633 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1],ymm4[2],ymm11[3,4],ymm4[5],ymm11[6,7]
1634 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm11 = ymm10[2,1,3,3]
1635 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm11[1],ymm4[2,3],ymm11[4],ymm4[5,6],ymm11[7]
1636 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1637 ; AVX2-FP-NEXT: vpermilps {{.*#+}} xmm11 = mem[1,0,2,2]
1638 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,0,1]
1639 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm12 = ymm6[0,0,2,1]
1640 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0],ymm11[1],ymm12[2,3],ymm11[4],ymm12[5,6],ymm11[7]
1641 ; AVX2-FP-NEXT: vbroadcastsd 64(%rdx), %ymm12
1642 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7]
1643 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1644 ; AVX2-FP-NEXT: vbroadcastsd 56(%rdi), %ymm12
1645 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm13 = ymm5[1,2,3,3,5,6,7,7]
1646 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,2,2,3]
1647 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7]
1648 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm13 = ymm7[2,1,3,3]
1649 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0],ymm13[1],ymm12[2,3],ymm13[4],ymm12[5,6],ymm13[7]
1650 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1651 ; AVX2-FP-NEXT: vpermilps {{.*#+}} xmm13 = mem[1,0,2,2]
1652 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,1,0,1]
1653 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm14 = ymm1[0,0,2,1]
1654 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0],ymm13[1],ymm14[2,3],ymm13[4],ymm14[5,6],ymm13[7]
1655 ; AVX2-FP-NEXT: vbroadcastsd 32(%rdx), %ymm14
1656 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm14[2],ymm13[3,4],ymm14[5],ymm13[6,7]
1657 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1658 ; AVX2-FP-NEXT: vbroadcastsd 120(%rdi), %ymm14
1659 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm15 = ymm2[1,2,3,3,5,6,7,7]
1660 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,3]
1661 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2],ymm15[3,4],ymm14[5],ymm15[6,7]
1662 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm15 = ymm3[2,1,3,3]
1663 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3],ymm15[4],ymm14[5,6],ymm15[7]
1664 ; AVX2-FP-NEXT: vpermilps {{.*#+}} xmm15 = mem[1,0,2,2]
1665 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,0,1]
1666 ; AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm0
1667 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[0,0,2,1]
1668 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm15[1],ymm4[2,3],ymm15[4],ymm4[5,6],ymm15[7]
1669 ; AVX2-FP-NEXT: vbroadcastsd 96(%rdx), %ymm15
1670 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm15 = ymm4[0,1],ymm15[2],ymm4[3,4],ymm15[5],ymm4[6,7]
1671 ; AVX2-FP-NEXT: vmovaps (%rsi), %ymm4
1672 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm11 = ymm4[1,2,3,3,5,6,7,7]
1673 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,2,2,3]
1674 ; AVX2-FP-NEXT: vbroadcastsd 24(%rdi), %ymm12
1675 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7]
1676 ; AVX2-FP-NEXT: vmovaps (%rdx), %ymm12
1677 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm13 = ymm12[2,1,3,3]
1678 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm13[1],ymm11[2,3],ymm13[4],ymm11[5,6],ymm13[7]
1679 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,0,3,3,4,4,7,7]
1680 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[1,1,2,2]
1681 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1],ymm4[2],ymm8[3,4],ymm4[5],ymm8[6,7]
1682 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm8 = ymm12[1,1,2,2]
1683 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0],ymm4[1,2],ymm8[3],ymm4[4,5],ymm8[6],ymm4[7]
1684 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm8 = ymm9[0,0,3,3,4,4,7,7]
1685 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[1,1,2,2]
1686 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm8[2],ymm6[3,4],ymm8[5],ymm6[6,7]
1687 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm8 = ymm10[1,1,2,2]
1688 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0],ymm6[1,2],ymm8[3],ymm6[4,5],ymm8[6],ymm6[7]
1689 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,0,3,3,4,4,7,7]
1690 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[1,1,2,2]
1691 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm5[2],ymm1[3,4],ymm5[5],ymm1[6,7]
1692 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm5 = ymm7[1,1,2,2]
1693 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0],ymm1[1,2],ymm5[3],ymm1[4,5],ymm5[6],ymm1[7]
1694 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,0,3,3,4,4,7,7]
1695 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2]
1696 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7]
1697 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm3[1,1,2,2]
1698 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7]
1699 ; AVX2-FP-NEXT: vmovaps %ymm0, 320(%rcx)
1700 ; AVX2-FP-NEXT: vmovaps %ymm1, 128(%rcx)
1701 ; AVX2-FP-NEXT: vmovaps %ymm6, 224(%rcx)
1702 ; AVX2-FP-NEXT: vmovaps %ymm4, 32(%rcx)
1703 ; AVX2-FP-NEXT: vmovaps %ymm11, 64(%rcx)
1704 ; AVX2-FP-NEXT: vmovaps %ymm15, 288(%rcx)
1705 ; AVX2-FP-NEXT: vmovaps %ymm14, 352(%rcx)
1706 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1707 ; AVX2-FP-NEXT: vmovaps %ymm0, 96(%rcx)
1708 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1709 ; AVX2-FP-NEXT: vmovaps %ymm0, 160(%rcx)
1710 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1711 ; AVX2-FP-NEXT: vmovaps %ymm0, 192(%rcx)
1712 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1713 ; AVX2-FP-NEXT: vmovaps %ymm0, 256(%rcx)
1714 ; AVX2-FP-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
1715 ; AVX2-FP-NEXT: vmovaps %ymm0, (%rcx)
1716 ; AVX2-FP-NEXT: addq $40, %rsp
1717 ; AVX2-FP-NEXT: vzeroupper
1718 ; AVX2-FP-NEXT: retq
1720 ; AVX2-FCP-LABEL: store_i32_stride3_vf32:
1721 ; AVX2-FCP: # %bb.0:
1722 ; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm4
1723 ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm7
1724 ; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm11
1725 ; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm3
1726 ; AVX2-FCP-NEXT: vmovaps (%rsi), %ymm1
1727 ; AVX2-FCP-NEXT: vmovaps 32(%rsi), %ymm10
1728 ; AVX2-FCP-NEXT: vmovaps 64(%rsi), %ymm13
1729 ; AVX2-FCP-NEXT: vmovaps (%rdx), %ymm2
1730 ; AVX2-FCP-NEXT: vmovaps 32(%rdx), %ymm12
1731 ; AVX2-FCP-NEXT: vmovaps 64(%rdx), %ymm14
1732 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,0,3,3,4,4,7,7]
1733 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm5 = ymm4[1,1,2,2]
1734 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2],ymm5[3,4],ymm0[5],ymm5[6,7]
1735 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm5 = ymm2[1,1,2,2]
1736 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm0[1,2],ymm5[3],ymm0[4,5],ymm5[6],ymm0[7]
1737 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1738 ; AVX2-FCP-NEXT: vbroadcastf128 {{.*#+}} ymm8 = [1,0,2,2,1,0,2,2]
1739 ; AVX2-FCP-NEXT: # ymm8 = mem[0,1,0,1]
1740 ; AVX2-FCP-NEXT: vpermps %ymm1, %ymm8, %ymm5
1741 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,0,2,1]
1742 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6],ymm5[7]
1743 ; AVX2-FCP-NEXT: vbroadcastsd (%rdx), %ymm5
1744 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7]
1745 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1746 ; AVX2-FCP-NEXT: vmovaps {{.*#+}} ymm9 = [5,6,5,6,5,6,7,7]
1747 ; AVX2-FCP-NEXT: vpermps %ymm13, %ymm9, %ymm5
1748 ; AVX2-FCP-NEXT: vbroadcastsd 88(%rdi), %ymm6
1749 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7]
1750 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm6 = ymm14[2,1,3,3]
1751 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5,6],ymm6[7]
1752 ; AVX2-FCP-NEXT: vpermps %ymm13, %ymm8, %ymm6
1753 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm15 = ymm11[0,0,2,1]
1754 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm15[0],ymm6[1],ymm15[2,3],ymm6[4],ymm15[5,6],ymm6[7]
1755 ; AVX2-FCP-NEXT: vbroadcastsd 64(%rdx), %ymm15
1756 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm15[2],ymm6[3,4],ymm15[5],ymm6[6,7]
1757 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,0,3,3,4,4,7,7]
1758 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[1,1,2,2]
1759 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm13[2],ymm11[3,4],ymm13[5],ymm11[6,7]
1760 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm13 = ymm14[1,1,2,2]
1761 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm13[0],ymm11[1,2],ymm13[3],ymm11[4,5],ymm13[6],ymm11[7]
1762 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm13 = ymm10[0,0,3,3,4,4,7,7]
1763 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm14 = ymm7[1,1,2,2]
1764 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm13[2],ymm14[3,4],ymm13[5],ymm14[6,7]
1765 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm14 = ymm12[1,1,2,2]
1766 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0],ymm13[1,2],ymm14[3],ymm13[4,5],ymm14[6],ymm13[7]
1767 ; AVX2-FCP-NEXT: vpermps %ymm10, %ymm9, %ymm14
1768 ; AVX2-FCP-NEXT: vbroadcastsd 56(%rdi), %ymm15
1769 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7]
1770 ; AVX2-FCP-NEXT: vmovaps 96(%rsi), %ymm15
1771 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[2,1,3,3]
1772 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2,3],ymm12[4],ymm14[5,6],ymm12[7]
1773 ; AVX2-FCP-NEXT: vpermps %ymm10, %ymm8, %ymm10
1774 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,0,2,1]
1775 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm10[1],ymm7[2,3],ymm10[4],ymm7[5,6],ymm10[7]
1776 ; AVX2-FCP-NEXT: vbroadcastsd 32(%rdx), %ymm10
1777 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm10[2],ymm7[3,4],ymm10[5],ymm7[6,7]
1778 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm10 = ymm15[0,0,3,3,4,4,7,7]
1779 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm14 = ymm3[1,1,2,2]
1780 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm14[0,1],ymm10[2],ymm14[3,4],ymm10[5],ymm14[6,7]
1781 ; AVX2-FCP-NEXT: vmovaps 96(%rdx), %ymm14
1782 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm14[1,1,2,2]
1783 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm10[1,2],ymm0[3],ymm10[4,5],ymm0[6],ymm10[7]
1784 ; AVX2-FCP-NEXT: vpermps %ymm15, %ymm9, %ymm10
1785 ; AVX2-FCP-NEXT: vbroadcastsd 120(%rdi), %ymm4
1786 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm10[0,1],ymm4[2],ymm10[3,4],ymm4[5],ymm10[6,7]
1787 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm10 = ymm14[2,1,3,3]
1788 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm10[1],ymm4[2,3],ymm10[4],ymm4[5,6],ymm10[7]
1789 ; AVX2-FCP-NEXT: vpermps %ymm15, %ymm8, %ymm8
1790 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,0,2,1]
1791 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm8[1],ymm3[2,3],ymm8[4],ymm3[5,6],ymm8[7]
1792 ; AVX2-FCP-NEXT: vbroadcastsd 96(%rdx), %ymm8
1793 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm8[2],ymm3[3,4],ymm8[5],ymm3[6,7]
1794 ; AVX2-FCP-NEXT: vpermps %ymm1, %ymm9, %ymm1
1795 ; AVX2-FCP-NEXT: vbroadcastsd 24(%rdi), %ymm8
1796 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm8[2],ymm1[3,4],ymm8[5],ymm1[6,7]
1797 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,3,3]
1798 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7]
1799 ; AVX2-FCP-NEXT: vmovaps %ymm1, 64(%rcx)
1800 ; AVX2-FCP-NEXT: vmovaps %ymm3, 288(%rcx)
1801 ; AVX2-FCP-NEXT: vmovaps %ymm4, 352(%rcx)
1802 ; AVX2-FCP-NEXT: vmovaps %ymm0, 320(%rcx)
1803 ; AVX2-FCP-NEXT: vmovaps %ymm7, 96(%rcx)
1804 ; AVX2-FCP-NEXT: vmovaps %ymm12, 160(%rcx)
1805 ; AVX2-FCP-NEXT: vmovaps %ymm13, 128(%rcx)
1806 ; AVX2-FCP-NEXT: vmovaps %ymm11, 224(%rcx)
1807 ; AVX2-FCP-NEXT: vmovaps %ymm6, 192(%rcx)
1808 ; AVX2-FCP-NEXT: vmovaps %ymm5, 256(%rcx)
1809 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1810 ; AVX2-FCP-NEXT: vmovaps %ymm0, (%rcx)
1811 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1812 ; AVX2-FCP-NEXT: vmovaps %ymm0, 32(%rcx)
1813 ; AVX2-FCP-NEXT: vzeroupper
1814 ; AVX2-FCP-NEXT: retq
1816 ; AVX512-LABEL: store_i32_stride3_vf32:
1818 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
1819 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1
1820 ; AVX512-NEXT: vmovdqa64 (%rsi), %zmm2
1821 ; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm3
1822 ; AVX512-NEXT: vmovdqa64 (%rdx), %zmm4
1823 ; AVX512-NEXT: vmovdqa64 64(%rdx), %zmm5
1824 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5]
1825 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm7
1826 ; AVX512-NEXT: vpermt2d %zmm2, %zmm6, %zmm7
1827 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15]
1828 ; AVX512-NEXT: vpermt2d %zmm4, %zmm8, %zmm7
1829 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0]
1830 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm10
1831 ; AVX512-NEXT: vpermt2d %zmm3, %zmm9, %zmm10
1832 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm11 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31]
1833 ; AVX512-NEXT: vpermt2d %zmm5, %zmm11, %zmm10
1834 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm12 = [5,0,22,6,0,23,7,0,24,8,0,25,9,0,26,10]
1835 ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm13
1836 ; AVX512-NEXT: vpermt2d %zmm1, %zmm12, %zmm13
1837 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15]
1838 ; AVX512-NEXT: vpermt2d %zmm5, %zmm14, %zmm13
1839 ; AVX512-NEXT: vpermt2d %zmm3, %zmm6, %zmm1
1840 ; AVX512-NEXT: vpermt2d %zmm5, %zmm8, %zmm1
1841 ; AVX512-NEXT: vpermi2d %zmm2, %zmm0, %zmm9
1842 ; AVX512-NEXT: vpermt2d %zmm4, %zmm11, %zmm9
1843 ; AVX512-NEXT: vpermt2d %zmm0, %zmm12, %zmm2
1844 ; AVX512-NEXT: vpermt2d %zmm4, %zmm14, %zmm2
1845 ; AVX512-NEXT: vmovdqa64 %zmm2, 64(%rcx)
1846 ; AVX512-NEXT: vmovdqa64 %zmm9, 128(%rcx)
1847 ; AVX512-NEXT: vmovdqa64 %zmm1, 192(%rcx)
1848 ; AVX512-NEXT: vmovdqa64 %zmm13, 256(%rcx)
1849 ; AVX512-NEXT: vmovdqa64 %zmm10, 320(%rcx)
1850 ; AVX512-NEXT: vmovdqa64 %zmm7, (%rcx)
1851 ; AVX512-NEXT: vzeroupper
1854 ; AVX512-FCP-LABEL: store_i32_stride3_vf32:
1855 ; AVX512-FCP: # %bb.0:
1856 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
1857 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
1858 ; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm2
1859 ; AVX512-FCP-NEXT: vmovdqa64 64(%rsi), %zmm3
1860 ; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm4
1861 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdx), %zmm5
1862 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5]
1863 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm7
1864 ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm6, %zmm7
1865 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15]
1866 ; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm8, %zmm7
1867 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0]
1868 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm10
1869 ; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm9, %zmm10
1870 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31]
1871 ; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm11, %zmm10
1872 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [5,0,22,6,0,23,7,0,24,8,0,25,9,0,26,10]
1873 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm13
1874 ; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm12, %zmm13
1875 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15]
1876 ; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm14, %zmm13
1877 ; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm6, %zmm1
1878 ; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm8, %zmm1
1879 ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm9
1880 ; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm11, %zmm9
1881 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm12, %zmm2
1882 ; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm14, %zmm2
1883 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 64(%rcx)
1884 ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, 128(%rcx)
1885 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 192(%rcx)
1886 ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 256(%rcx)
1887 ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, 320(%rcx)
1888 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, (%rcx)
1889 ; AVX512-FCP-NEXT: vzeroupper
1890 ; AVX512-FCP-NEXT: retq
1892 ; AVX512DQ-LABEL: store_i32_stride3_vf32:
1893 ; AVX512DQ: # %bb.0:
1894 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0
1895 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm1
1896 ; AVX512DQ-NEXT: vmovdqa64 (%rsi), %zmm2
1897 ; AVX512DQ-NEXT: vmovdqa64 64(%rsi), %zmm3
1898 ; AVX512DQ-NEXT: vmovdqa64 (%rdx), %zmm4
1899 ; AVX512DQ-NEXT: vmovdqa64 64(%rdx), %zmm5
1900 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5]
1901 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm7
1902 ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm6, %zmm7
1903 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15]
1904 ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm8, %zmm7
1905 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0]
1906 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm10
1907 ; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm9, %zmm10
1908 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm11 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31]
1909 ; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm11, %zmm10
1910 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm12 = [5,0,22,6,0,23,7,0,24,8,0,25,9,0,26,10]
1911 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm13
1912 ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm12, %zmm13
1913 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15]
1914 ; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm14, %zmm13
1915 ; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm6, %zmm1
1916 ; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm8, %zmm1
1917 ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm0, %zmm9
1918 ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm11, %zmm9
1919 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm12, %zmm2
1920 ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm14, %zmm2
1921 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, 64(%rcx)
1922 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, 128(%rcx)
1923 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, 192(%rcx)
1924 ; AVX512DQ-NEXT: vmovdqa64 %zmm13, 256(%rcx)
1925 ; AVX512DQ-NEXT: vmovdqa64 %zmm10, 320(%rcx)
1926 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, (%rcx)
1927 ; AVX512DQ-NEXT: vzeroupper
1928 ; AVX512DQ-NEXT: retq
1930 ; AVX512DQ-FCP-LABEL: store_i32_stride3_vf32:
1931 ; AVX512DQ-FCP: # %bb.0:
1932 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
1933 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
1934 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm2
1935 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rsi), %zmm3
1936 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm4
1937 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), %zmm5
1938 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5]
1939 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm7
1940 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm6, %zmm7
1941 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15]
1942 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm8, %zmm7
1943 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0]
1944 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm10
1945 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm9, %zmm10
1946 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31]
1947 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm11, %zmm10
1948 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [5,0,22,6,0,23,7,0,24,8,0,25,9,0,26,10]
1949 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm13
1950 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm12, %zmm13
1951 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15]
1952 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm14, %zmm13
1953 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm6, %zmm1
1954 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm8, %zmm1
1955 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm9
1956 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm11, %zmm9
1957 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm12, %zmm2
1958 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm14, %zmm2
1959 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 64(%rcx)
1960 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, 128(%rcx)
1961 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 192(%rcx)
1962 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 256(%rcx)
1963 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, 320(%rcx)
1964 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, (%rcx)
1965 ; AVX512DQ-FCP-NEXT: vzeroupper
1966 ; AVX512DQ-FCP-NEXT: retq
1968 ; AVX512BW-LABEL: store_i32_stride3_vf32:
1969 ; AVX512BW: # %bb.0:
1970 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
1971 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1
1972 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm2
1973 ; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm3
1974 ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm4
1975 ; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm5
1976 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5]
1977 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7
1978 ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm6, %zmm7
1979 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15]
1980 ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm8, %zmm7
1981 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0]
1982 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10
1983 ; AVX512BW-NEXT: vpermt2d %zmm3, %zmm9, %zmm10
1984 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31]
1985 ; AVX512BW-NEXT: vpermt2d %zmm5, %zmm11, %zmm10
1986 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [5,0,22,6,0,23,7,0,24,8,0,25,9,0,26,10]
1987 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm13
1988 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm12, %zmm13
1989 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15]
1990 ; AVX512BW-NEXT: vpermt2d %zmm5, %zmm14, %zmm13
1991 ; AVX512BW-NEXT: vpermt2d %zmm3, %zmm6, %zmm1
1992 ; AVX512BW-NEXT: vpermt2d %zmm5, %zmm8, %zmm1
1993 ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm9
1994 ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm11, %zmm9
1995 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm12, %zmm2
1996 ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm14, %zmm2
1997 ; AVX512BW-NEXT: vmovdqa64 %zmm2, 64(%rcx)
1998 ; AVX512BW-NEXT: vmovdqa64 %zmm9, 128(%rcx)
1999 ; AVX512BW-NEXT: vmovdqa64 %zmm1, 192(%rcx)
2000 ; AVX512BW-NEXT: vmovdqa64 %zmm13, 256(%rcx)
2001 ; AVX512BW-NEXT: vmovdqa64 %zmm10, 320(%rcx)
2002 ; AVX512BW-NEXT: vmovdqa64 %zmm7, (%rcx)
2003 ; AVX512BW-NEXT: vzeroupper
2004 ; AVX512BW-NEXT: retq
2006 ; AVX512BW-FCP-LABEL: store_i32_stride3_vf32:
2007 ; AVX512BW-FCP: # %bb.0:
2008 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
2009 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
2010 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm2
2011 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm3
2012 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm4
2013 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm5
2014 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5]
2015 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm7
2016 ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm6, %zmm7
2017 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15]
2018 ; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm8, %zmm7
2019 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0]
2020 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10
2021 ; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm9, %zmm10
2022 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31]
2023 ; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm11, %zmm10
2024 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [5,0,22,6,0,23,7,0,24,8,0,25,9,0,26,10]
2025 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm13
2026 ; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm12, %zmm13
2027 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15]
2028 ; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm14, %zmm13
2029 ; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm6, %zmm1
2030 ; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm8, %zmm1
2031 ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm9
2032 ; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm11, %zmm9
2033 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm12, %zmm2
2034 ; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm14, %zmm2
2035 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 64(%rcx)
2036 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, 128(%rcx)
2037 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 192(%rcx)
2038 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, 256(%rcx)
2039 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, 320(%rcx)
2040 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, (%rcx)
2041 ; AVX512BW-FCP-NEXT: vzeroupper
2042 ; AVX512BW-FCP-NEXT: retq
2044 ; AVX512DQ-BW-LABEL: store_i32_stride3_vf32:
2045 ; AVX512DQ-BW: # %bb.0:
2046 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0
2047 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1
2048 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm2
2049 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm3
2050 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm4
2051 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm5
2052 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5]
2053 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm7
2054 ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm6, %zmm7
2055 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15]
2056 ; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm8, %zmm7
2057 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0]
2058 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm10
2059 ; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm9, %zmm10
2060 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31]
2061 ; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm11, %zmm10
2062 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [5,0,22,6,0,23,7,0,24,8,0,25,9,0,26,10]
2063 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm13
2064 ; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm12, %zmm13
2065 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15]
2066 ; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm14, %zmm13
2067 ; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm6, %zmm1
2068 ; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm8, %zmm1
2069 ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm9
2070 ; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm11, %zmm9
2071 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm12, %zmm2
2072 ; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm14, %zmm2
2073 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 64(%rcx)
2074 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, 128(%rcx)
2075 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 192(%rcx)
2076 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, 256(%rcx)
2077 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, 320(%rcx)
2078 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, (%rcx)
2079 ; AVX512DQ-BW-NEXT: vzeroupper
2080 ; AVX512DQ-BW-NEXT: retq
2082 ; AVX512DQ-BW-FCP-LABEL: store_i32_stride3_vf32:
2083 ; AVX512DQ-BW-FCP: # %bb.0:
2084 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
2085 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
2086 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm2
2087 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm3
2088 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm4
2089 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm5
2090 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5]
2091 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm7
2092 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm6, %zmm7
2093 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15]
2094 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm8, %zmm7
2095 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0]
2096 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10
2097 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm9, %zmm10
2098 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31]
2099 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm11, %zmm10
2100 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [5,0,22,6,0,23,7,0,24,8,0,25,9,0,26,10]
2101 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm13
2102 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm12, %zmm13
2103 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15]
2104 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm14, %zmm13
2105 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm6, %zmm1
2106 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm8, %zmm1
2107 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm9
2108 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm11, %zmm9
2109 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm12, %zmm2
2110 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm14, %zmm2
2111 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 64(%rcx)
2112 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, 128(%rcx)
2113 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 192(%rcx)
2114 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, 256(%rcx)
2115 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, 320(%rcx)
2116 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, (%rcx)
2117 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
2118 ; AVX512DQ-BW-FCP-NEXT: retq
2119 %in.vec0 = load <32 x i32>, ptr %in.vecptr0, align 64
2120 %in.vec1 = load <32 x i32>, ptr %in.vecptr1, align 64
2121 %in.vec2 = load <32 x i32>, ptr %in.vecptr2, align 64
2122 %1 = shufflevector <32 x i32> %in.vec0, <32 x i32> %in.vec1, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
2123 %2 = shufflevector <32 x i32> %in.vec2, <32 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2124 %3 = shufflevector <64 x i32> %1, <64 x i32> %2, <96 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95>
2125 %interleaved.vec = shufflevector <96 x i32> %3, <96 x i32> poison, <96 x i32> <i32 0, i32 32, i32 64, i32 1, i32 33, i32 65, i32 2, i32 34, i32 66, i32 3, i32 35, i32 67, i32 4, i32 36, i32 68, i32 5, i32 37, i32 69, i32 6, i32 38, i32 70, i32 7, i32 39, i32 71, i32 8, i32 40, i32 72, i32 9, i32 41, i32 73, i32 10, i32 42, i32 74, i32 11, i32 43, i32 75, i32 12, i32 44, i32 76, i32 13, i32 45, i32 77, i32 14, i32 46, i32 78, i32 15, i32 47, i32 79, i32 16, i32 48, i32 80, i32 17, i32 49, i32 81, i32 18, i32 50, i32 82, i32 19, i32 51, i32 83, i32 20, i32 52, i32 84, i32 21, i32 53, i32 85, i32 22, i32 54, i32 86, i32 23, i32 55, i32 87, i32 24, i32 56, i32 88, i32 25, i32 57, i32 89, i32 26, i32 58, i32 90, i32 27, i32 59, i32 91, i32 28, i32 60, i32 92, i32 29, i32 61, i32 93, i32 30, i32 62, i32 94, i32 31, i32 63, i32 95>
2126 store <96 x i32> %interleaved.vec, ptr %out.vec, align 64
2130 define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %out.vec) nounwind {
2131 ; SSE-LABEL: store_i32_stride3_vf64:
2133 ; SSE-NEXT: subq $664, %rsp # imm = 0x298
2134 ; SSE-NEXT: movaps (%rdi), %xmm2
2135 ; SSE-NEXT: movaps 16(%rdi), %xmm4
2136 ; SSE-NEXT: movaps 32(%rdi), %xmm5
2137 ; SSE-NEXT: movaps 48(%rdi), %xmm6
2138 ; SSE-NEXT: movaps (%rsi), %xmm0
2139 ; SSE-NEXT: movaps 16(%rsi), %xmm11
2140 ; SSE-NEXT: movaps 32(%rsi), %xmm14
2141 ; SSE-NEXT: movaps 48(%rsi), %xmm3
2142 ; SSE-NEXT: movaps (%rdx), %xmm7
2143 ; SSE-NEXT: movaps 16(%rdx), %xmm8
2144 ; SSE-NEXT: movaps 32(%rdx), %xmm9
2145 ; SSE-NEXT: movaps 48(%rdx), %xmm10
2146 ; SSE-NEXT: movaps %xmm2, %xmm1
2147 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm7[0,3]
2148 ; SSE-NEXT: movaps %xmm7, %xmm12
2149 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2150 ; SSE-NEXT: movaps %xmm2, %xmm7
2151 ; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1]
2152 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,0]
2153 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2154 ; SSE-NEXT: movaps %xmm2, %xmm1
2155 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
2156 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm0[3,3]
2157 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2158 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm12[1,1]
2159 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2160 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2161 ; SSE-NEXT: movaps %xmm4, %xmm0
2162 ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2163 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm8[0,3]
2164 ; SSE-NEXT: movaps %xmm4, %xmm1
2165 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1]
2166 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
2167 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2168 ; SSE-NEXT: movaps %xmm4, %xmm0
2169 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm11[1]
2170 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3],xmm11[3,3]
2171 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2172 ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,1],xmm8[1,1]
2173 ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm0[0,2]
2174 ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2175 ; SSE-NEXT: movaps %xmm5, %xmm0
2176 ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2177 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm9[0,3]
2178 ; SSE-NEXT: movaps %xmm5, %xmm1
2179 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1]
2180 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
2181 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2182 ; SSE-NEXT: movaps %xmm5, %xmm0
2183 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm14[1]
2184 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm14[3,3]
2185 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2186 ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,1],xmm9[1,1]
2187 ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2],xmm0[0,2]
2188 ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2189 ; SSE-NEXT: movaps %xmm6, %xmm0
2190 ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2191 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm10[0,3]
2192 ; SSE-NEXT: movaps %xmm6, %xmm1
2193 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
2194 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
2195 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2196 ; SSE-NEXT: movaps %xmm6, %xmm0
2197 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1]
2198 ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,3],xmm3[3,3]
2199 ; SSE-NEXT: movaps %xmm6, (%rsp) # 16-byte Spill
2200 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm10[1,1]
2201 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm0[0,2]
2202 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2203 ; SSE-NEXT: movaps 64(%rdi), %xmm2
2204 ; SSE-NEXT: movaps 64(%rdx), %xmm1
2205 ; SSE-NEXT: movaps %xmm2, %xmm0
2206 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3]
2207 ; SSE-NEXT: movaps %xmm1, %xmm4
2208 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2209 ; SSE-NEXT: movaps 64(%rsi), %xmm1
2210 ; SSE-NEXT: movaps %xmm2, %xmm3
2211 ; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
2212 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0]
2213 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2214 ; SSE-NEXT: movaps %xmm2, %xmm0
2215 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
2216 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm1[3,3]
2217 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2218 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm4[1,1]
2219 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2]
2220 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2221 ; SSE-NEXT: movaps 80(%rdi), %xmm2
2222 ; SSE-NEXT: movaps 80(%rdx), %xmm1
2223 ; SSE-NEXT: movaps %xmm2, %xmm0
2224 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3]
2225 ; SSE-NEXT: movaps %xmm1, %xmm4
2226 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2227 ; SSE-NEXT: movaps 80(%rsi), %xmm1
2228 ; SSE-NEXT: movaps %xmm2, %xmm3
2229 ; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
2230 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0]
2231 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2232 ; SSE-NEXT: movaps %xmm2, %xmm0
2233 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
2234 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm1[3,3]
2235 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2236 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm4[1,1]
2237 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2]
2238 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2239 ; SSE-NEXT: movaps 96(%rdi), %xmm2
2240 ; SSE-NEXT: movaps 96(%rdx), %xmm1
2241 ; SSE-NEXT: movaps %xmm2, %xmm0
2242 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3]
2243 ; SSE-NEXT: movaps %xmm1, %xmm4
2244 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2245 ; SSE-NEXT: movaps 96(%rsi), %xmm1
2246 ; SSE-NEXT: movaps %xmm2, %xmm3
2247 ; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
2248 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0]
2249 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2250 ; SSE-NEXT: movaps %xmm2, %xmm0
2251 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
2252 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm1[3,3]
2253 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2254 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm4[1,1]
2255 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2]
2256 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2257 ; SSE-NEXT: movaps 112(%rdi), %xmm2
2258 ; SSE-NEXT: movaps 112(%rdx), %xmm1
2259 ; SSE-NEXT: movaps %xmm2, %xmm0
2260 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3]
2261 ; SSE-NEXT: movaps %xmm1, %xmm4
2262 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2263 ; SSE-NEXT: movaps 112(%rsi), %xmm1
2264 ; SSE-NEXT: movaps %xmm2, %xmm3
2265 ; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
2266 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0]
2267 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2268 ; SSE-NEXT: movaps %xmm2, %xmm0
2269 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
2270 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm1[3,3]
2271 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2272 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm4[1,1]
2273 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2]
2274 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2275 ; SSE-NEXT: movaps 128(%rdi), %xmm2
2276 ; SSE-NEXT: movaps 128(%rdx), %xmm1
2277 ; SSE-NEXT: movaps %xmm2, %xmm0
2278 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3]
2279 ; SSE-NEXT: movaps %xmm1, %xmm4
2280 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2281 ; SSE-NEXT: movaps 128(%rsi), %xmm1
2282 ; SSE-NEXT: movaps %xmm2, %xmm3
2283 ; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
2284 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0]
2285 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2286 ; SSE-NEXT: movaps %xmm2, %xmm0
2287 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
2288 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm1[3,3]
2289 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2290 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm4[1,1]
2291 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2]
2292 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2293 ; SSE-NEXT: movaps 144(%rdi), %xmm2
2294 ; SSE-NEXT: movaps 144(%rdx), %xmm1
2295 ; SSE-NEXT: movaps %xmm2, %xmm0
2296 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3]
2297 ; SSE-NEXT: movaps %xmm1, %xmm4
2298 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2299 ; SSE-NEXT: movaps 144(%rsi), %xmm1
2300 ; SSE-NEXT: movaps %xmm2, %xmm3
2301 ; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
2302 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0]
2303 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2304 ; SSE-NEXT: movaps %xmm2, %xmm0
2305 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
2306 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm1[3,3]
2307 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2308 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm4[1,1]
2309 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2]
2310 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2311 ; SSE-NEXT: movaps 160(%rdi), %xmm14
2312 ; SSE-NEXT: movaps 160(%rdx), %xmm1
2313 ; SSE-NEXT: movaps %xmm14, %xmm0
2314 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3]
2315 ; SSE-NEXT: movaps %xmm1, %xmm3
2316 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2317 ; SSE-NEXT: movaps 160(%rsi), %xmm1
2318 ; SSE-NEXT: movaps %xmm14, %xmm2
2319 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
2320 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0]
2321 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2322 ; SSE-NEXT: movaps %xmm14, %xmm0
2323 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
2324 ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[3,3],xmm1[3,3]
2325 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[1,1]
2326 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2]
2327 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2328 ; SSE-NEXT: movaps 176(%rdi), %xmm12
2329 ; SSE-NEXT: movaps 176(%rdx), %xmm1
2330 ; SSE-NEXT: movaps %xmm12, %xmm0
2331 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3]
2332 ; SSE-NEXT: movaps %xmm1, %xmm3
2333 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2334 ; SSE-NEXT: movaps 176(%rsi), %xmm1
2335 ; SSE-NEXT: movaps %xmm12, %xmm2
2336 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
2337 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0]
2338 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2339 ; SSE-NEXT: movaps %xmm12, %xmm0
2340 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
2341 ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[3,3],xmm1[3,3]
2342 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[1,1]
2343 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2]
2344 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2345 ; SSE-NEXT: movaps 192(%rdi), %xmm13
2346 ; SSE-NEXT: movaps 192(%rdx), %xmm1
2347 ; SSE-NEXT: movaps %xmm13, %xmm0
2348 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3]
2349 ; SSE-NEXT: movaps %xmm1, %xmm2
2350 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2351 ; SSE-NEXT: movaps 192(%rsi), %xmm11
2352 ; SSE-NEXT: movaps %xmm13, %xmm1
2353 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1]
2354 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
2355 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2356 ; SSE-NEXT: movaps %xmm13, %xmm0
2357 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm11[1]
2358 ; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[3,3],xmm11[3,3]
2359 ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,1],xmm2[1,1]
2360 ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm0[0,2]
2361 ; SSE-NEXT: movaps 208(%rdi), %xmm6
2362 ; SSE-NEXT: movaps 208(%rdx), %xmm1
2363 ; SSE-NEXT: movaps %xmm6, %xmm0
2364 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3]
2365 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2366 ; SSE-NEXT: movaps 208(%rsi), %xmm8
2367 ; SSE-NEXT: movaps %xmm6, %xmm10
2368 ; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1]
2369 ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm0[2,0]
2370 ; SSE-NEXT: movaps %xmm6, %xmm0
2371 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm8[1]
2372 ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,3],xmm8[3,3]
2373 ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm1[1,1]
2374 ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm0[0,2]
2375 ; SSE-NEXT: movaps 224(%rdi), %xmm5
2376 ; SSE-NEXT: movaps 224(%rdx), %xmm15
2377 ; SSE-NEXT: movaps %xmm5, %xmm0
2378 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm15[0,3]
2379 ; SSE-NEXT: movaps 224(%rsi), %xmm4
2380 ; SSE-NEXT: movaps %xmm5, %xmm7
2381 ; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1]
2382 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm0[2,0]
2383 ; SSE-NEXT: movaps %xmm5, %xmm0
2384 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1]
2385 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm4[3,3]
2386 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm15[1,1]
2387 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm0[0,2]
2388 ; SSE-NEXT: movaps 240(%rdi), %xmm2
2389 ; SSE-NEXT: movaps 240(%rdx), %xmm9
2390 ; SSE-NEXT: movaps %xmm2, %xmm1
2391 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm9[0,3]
2392 ; SSE-NEXT: movaps 240(%rsi), %xmm0
2393 ; SSE-NEXT: movaps %xmm2, %xmm3
2394 ; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
2395 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,0]
2396 ; SSE-NEXT: movaps %xmm2, %xmm1
2397 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
2398 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm0[3,3]
2399 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm9[1,1]
2400 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2401 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2402 ; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
2403 ; SSE-NEXT: # xmm1 = xmm1[1,2],mem[2,3]
2404 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2405 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2406 ; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
2407 ; SSE-NEXT: # xmm1 = xmm1[1,2],mem[2,3]
2408 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2409 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2410 ; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
2411 ; SSE-NEXT: # xmm1 = xmm1[1,2],mem[2,3]
2412 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2413 ; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload
2414 ; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
2415 ; SSE-NEXT: # xmm1 = xmm1[1,2],mem[2,3]
2416 ; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill
2417 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2418 ; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
2419 ; SSE-NEXT: # xmm1 = xmm1[1,2],mem[2,3]
2420 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2421 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2422 ; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
2423 ; SSE-NEXT: # xmm1 = xmm1[1,2],mem[2,3]
2424 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2425 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2426 ; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
2427 ; SSE-NEXT: # xmm1 = xmm1[1,2],mem[2,3]
2428 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2429 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2430 ; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
2431 ; SSE-NEXT: # xmm1 = xmm1[1,2],mem[2,3]
2432 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2433 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2434 ; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
2435 ; SSE-NEXT: # xmm1 = xmm1[1,2],mem[2,3]
2436 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2437 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2438 ; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
2439 ; SSE-NEXT: # xmm1 = xmm1[1,2],mem[2,3]
2440 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2441 ; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
2442 ; SSE-NEXT: # xmm14 = xmm14[1,2],mem[2,3]
2443 ; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
2444 ; SSE-NEXT: # xmm12 = xmm12[1,2],mem[2,3]
2445 ; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
2446 ; SSE-NEXT: # xmm13 = xmm13[1,2],mem[2,3]
2447 ; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
2448 ; SSE-NEXT: # xmm6 = xmm6[1,2],mem[2,3]
2449 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,2],xmm15[2,3]
2450 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,2],xmm9[2,3]
2451 ; SSE-NEXT: movaps %xmm0, 736(%rcx)
2452 ; SSE-NEXT: movaps %xmm3, 720(%rcx)
2453 ; SSE-NEXT: movaps %xmm4, 688(%rcx)
2454 ; SSE-NEXT: movaps %xmm7, 672(%rcx)
2455 ; SSE-NEXT: movaps %xmm8, 640(%rcx)
2456 ; SSE-NEXT: movaps %xmm10, 624(%rcx)
2457 ; SSE-NEXT: movaps %xmm11, 592(%rcx)
2458 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2459 ; SSE-NEXT: movaps %xmm0, 576(%rcx)
2460 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2461 ; SSE-NEXT: movaps %xmm0, 544(%rcx)
2462 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2463 ; SSE-NEXT: movaps %xmm0, 528(%rcx)
2464 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2465 ; SSE-NEXT: movaps %xmm0, 496(%rcx)
2466 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2467 ; SSE-NEXT: movaps %xmm0, 480(%rcx)
2468 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2469 ; SSE-NEXT: movaps %xmm0, 448(%rcx)
2470 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2471 ; SSE-NEXT: movaps %xmm0, 432(%rcx)
2472 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2473 ; SSE-NEXT: movaps %xmm0, 400(%rcx)
2474 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2475 ; SSE-NEXT: movaps %xmm0, 384(%rcx)
2476 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2477 ; SSE-NEXT: movaps %xmm0, 352(%rcx)
2478 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2479 ; SSE-NEXT: movaps %xmm0, 336(%rcx)
2480 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2481 ; SSE-NEXT: movaps %xmm0, 304(%rcx)
2482 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2483 ; SSE-NEXT: movaps %xmm0, 288(%rcx)
2484 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2485 ; SSE-NEXT: movaps %xmm0, 256(%rcx)
2486 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2487 ; SSE-NEXT: movaps %xmm0, 240(%rcx)
2488 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2489 ; SSE-NEXT: movaps %xmm0, 208(%rcx)
2490 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2491 ; SSE-NEXT: movaps %xmm0, 192(%rcx)
2492 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2493 ; SSE-NEXT: movaps %xmm0, 160(%rcx)
2494 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2495 ; SSE-NEXT: movaps %xmm0, 144(%rcx)
2496 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2497 ; SSE-NEXT: movaps %xmm0, 112(%rcx)
2498 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2499 ; SSE-NEXT: movaps %xmm0, 96(%rcx)
2500 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2501 ; SSE-NEXT: movaps %xmm0, 64(%rcx)
2502 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2503 ; SSE-NEXT: movaps %xmm0, 48(%rcx)
2504 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2505 ; SSE-NEXT: movaps %xmm0, 16(%rcx)
2506 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2507 ; SSE-NEXT: movaps %xmm0, (%rcx)
2508 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0,1,3]
2509 ; SSE-NEXT: movaps %xmm2, 752(%rcx)
2510 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0,1,3]
2511 ; SSE-NEXT: movaps %xmm5, 704(%rcx)
2512 ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0,1,3]
2513 ; SSE-NEXT: movaps %xmm6, 656(%rcx)
2514 ; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,0,1,3]
2515 ; SSE-NEXT: movaps %xmm13, 608(%rcx)
2516 ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0,1,3]
2517 ; SSE-NEXT: movaps %xmm12, 560(%rcx)
2518 ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0,1,3]
2519 ; SSE-NEXT: movaps %xmm14, 512(%rcx)
2520 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2521 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3]
2522 ; SSE-NEXT: movaps %xmm0, 464(%rcx)
2523 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2524 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3]
2525 ; SSE-NEXT: movaps %xmm0, 416(%rcx)
2526 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2527 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3]
2528 ; SSE-NEXT: movaps %xmm0, 368(%rcx)
2529 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2530 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3]
2531 ; SSE-NEXT: movaps %xmm0, 320(%rcx)
2532 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2533 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3]
2534 ; SSE-NEXT: movaps %xmm0, 272(%rcx)
2535 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2536 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3]
2537 ; SSE-NEXT: movaps %xmm0, 224(%rcx)
2538 ; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
2539 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3]
2540 ; SSE-NEXT: movaps %xmm0, 176(%rcx)
2541 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2542 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3]
2543 ; SSE-NEXT: movaps %xmm0, 128(%rcx)
2544 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2545 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3]
2546 ; SSE-NEXT: movaps %xmm0, 80(%rcx)
2547 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2548 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3]
2549 ; SSE-NEXT: movaps %xmm0, 32(%rcx)
2550 ; SSE-NEXT: addq $664, %rsp # imm = 0x298
2553 ; AVX-LABEL: store_i32_stride3_vf64:
2555 ; AVX-NEXT: subq $200, %rsp
2556 ; AVX-NEXT: vmovapd (%rdx), %ymm8
2557 ; AVX-NEXT: vmovapd 32(%rdx), %ymm9
2558 ; AVX-NEXT: vmovapd 64(%rdx), %ymm11
2559 ; AVX-NEXT: vmovapd 96(%rdx), %ymm12
2560 ; AVX-NEXT: vmovaps (%rsi), %xmm0
2561 ; AVX-NEXT: vmovaps 16(%rsi), %xmm1
2562 ; AVX-NEXT: vmovaps 32(%rsi), %xmm2
2563 ; AVX-NEXT: vmovaps 48(%rsi), %xmm3
2564 ; AVX-NEXT: vmovaps (%rdi), %xmm4
2565 ; AVX-NEXT: vmovaps 16(%rdi), %xmm5
2566 ; AVX-NEXT: vmovaps 32(%rdi), %xmm6
2567 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm4[1],xmm0[1]
2568 ; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm0[1,1],xmm7[0,2]
2569 ; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0]
2570 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm4[2,1]
2571 ; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm0
2572 ; AVX-NEXT: vbroadcastsd (%rdx), %ymm4
2573 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2],ymm0[3,4],ymm4[5],ymm0[6,7]
2574 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2575 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm5[3,3],xmm1[3,3]
2576 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm5[1],xmm1[1]
2577 ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1],xmm4[0,2]
2578 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
2579 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm8[2,3,2,3]
2580 ; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0,0,3,3]
2581 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
2582 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2583 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm6[1],xmm2[1]
2584 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm2[1,1],xmm0[0,2]
2585 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm6[0]
2586 ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm6[2,1]
2587 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
2588 ; AVX-NEXT: vbroadcastsd 32(%rdx), %ymm1
2589 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
2590 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2591 ; AVX-NEXT: vmovaps 48(%rdi), %xmm0
2592 ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3],xmm3[3,3]
2593 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1]
2594 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,1],xmm0[0,2]
2595 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
2596 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm9[2,3,2,3]
2597 ; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0,0,3,3]
2598 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
2599 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2600 ; AVX-NEXT: vmovaps 64(%rsi), %xmm0
2601 ; AVX-NEXT: vmovaps 64(%rdi), %xmm1
2602 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm1[1],xmm0[1]
2603 ; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm0[1,1],xmm2[0,2]
2604 ; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2605 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,1]
2606 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
2607 ; AVX-NEXT: vbroadcastsd 64(%rdx), %ymm1
2608 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
2609 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2610 ; AVX-NEXT: vmovaps 80(%rsi), %xmm0
2611 ; AVX-NEXT: vmovaps 80(%rdi), %xmm1
2612 ; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3],xmm0[3,3]
2613 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
2614 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,2]
2615 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
2616 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm11[2,3,2,3]
2617 ; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0,0,3,3]
2618 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
2619 ; AVX-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
2620 ; AVX-NEXT: vmovaps 96(%rsi), %xmm0
2621 ; AVX-NEXT: vmovaps 96(%rdi), %xmm1
2622 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm1[1],xmm0[1]
2623 ; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm0[1,1],xmm2[0,2]
2624 ; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2625 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,1]
2626 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
2627 ; AVX-NEXT: vbroadcastsd 96(%rdx), %ymm1
2628 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
2629 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2630 ; AVX-NEXT: vmovaps 112(%rsi), %xmm0
2631 ; AVX-NEXT: vmovaps 112(%rdi), %xmm1
2632 ; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3],xmm0[3,3]
2633 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
2634 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,2]
2635 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
2636 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm12[2,3,2,3]
2637 ; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0,0,3,3]
2638 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
2639 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2640 ; AVX-NEXT: vmovaps 128(%rsi), %xmm0
2641 ; AVX-NEXT: vmovaps 128(%rdi), %xmm1
2642 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm1[1],xmm0[1]
2643 ; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm0[1,1],xmm2[0,2]
2644 ; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2645 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,1]
2646 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
2647 ; AVX-NEXT: vbroadcastsd 128(%rdx), %ymm1
2648 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
2649 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2650 ; AVX-NEXT: vmovaps 144(%rsi), %xmm0
2651 ; AVX-NEXT: vmovaps 144(%rdi), %xmm1
2652 ; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3],xmm0[3,3]
2653 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
2654 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,2]
2655 ; AVX-NEXT: vmovapd 128(%rdx), %ymm6
2656 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
2657 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm6[2,3,2,3]
2658 ; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0,0,3,3]
2659 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
2660 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2661 ; AVX-NEXT: vmovaps 160(%rsi), %xmm0
2662 ; AVX-NEXT: vmovaps 160(%rdi), %xmm1
2663 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm1[1],xmm0[1]
2664 ; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm0[1,1],xmm2[0,2]
2665 ; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2666 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,1]
2667 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
2668 ; AVX-NEXT: vbroadcastsd 160(%rdx), %ymm1
2669 ; AVX-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
2670 ; AVX-NEXT: vmovaps 176(%rsi), %xmm0
2671 ; AVX-NEXT: vmovaps 176(%rdi), %xmm1
2672 ; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3],xmm0[3,3]
2673 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
2674 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,2]
2675 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
2676 ; AVX-NEXT: vmovapd 160(%rdx), %ymm4
2677 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3,2,3]
2678 ; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0,0,3,3]
2679 ; AVX-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
2680 ; AVX-NEXT: vmovaps 192(%rsi), %xmm0
2681 ; AVX-NEXT: vmovaps 192(%rdi), %xmm1
2682 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm1[1],xmm0[1]
2683 ; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm0[1,1],xmm3[0,2]
2684 ; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2685 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,1]
2686 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
2687 ; AVX-NEXT: vbroadcastsd 192(%rdx), %ymm1
2688 ; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
2689 ; AVX-NEXT: vmovaps 208(%rsi), %xmm0
2690 ; AVX-NEXT: vmovaps 208(%rdi), %xmm1
2691 ; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm1[3,3],xmm0[3,3]
2692 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
2693 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,2]
2694 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
2695 ; AVX-NEXT: vmovapd 192(%rdx), %ymm2
2696 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm2[2,3,2,3]
2697 ; AVX-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0,0,3,3]
2698 ; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6],ymm3[7]
2699 ; AVX-NEXT: vmovaps 224(%rsi), %xmm0
2700 ; AVX-NEXT: vmovaps 224(%rdi), %xmm3
2701 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm3[1],xmm0[1]
2702 ; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm0[1,1],xmm5[0,2]
2703 ; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0]
2704 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[2,1]
2705 ; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0
2706 ; AVX-NEXT: vbroadcastsd 224(%rdx), %ymm3
2707 ; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7]
2708 ; AVX-NEXT: vmovaps 240(%rsi), %xmm0
2709 ; AVX-NEXT: vmovaps 240(%rdi), %xmm3
2710 ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm3[3,3],xmm0[3,3]
2711 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1]
2712 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1],xmm3[0,2]
2713 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
2714 ; AVX-NEXT: vmovapd 224(%rdx), %ymm0
2715 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3]
2716 ; AVX-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0,0,3,3]
2717 ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7]
2718 ; AVX-NEXT: vpermilps {{.*#+}} ymm1 = mem[0,0,3,3,4,4,7,7]
2719 ; AVX-NEXT: vpermilpd {{.*#+}} ymm13 = mem[1,0,2,2]
2720 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1],ymm1[2],ymm13[3,4],ymm1[5],ymm13[6,7]
2721 ; AVX-NEXT: vshufpd {{.*#+}} ymm8 = ymm8[1,1,2,2]
2722 ; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm1[1,2],ymm8[3],ymm1[4,5],ymm8[6],ymm1[7]
2723 ; AVX-NEXT: vpermilps {{.*#+}} ymm1 = mem[0,0,3,3,4,4,7,7]
2724 ; AVX-NEXT: vpermilpd {{.*#+}} ymm13 = mem[1,0,2,2]
2725 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1],ymm1[2],ymm13[3,4],ymm1[5],ymm13[6,7]
2726 ; AVX-NEXT: vshufpd {{.*#+}} ymm9 = ymm9[1,1,2,2]
2727 ; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm1[1,2],ymm9[3],ymm1[4,5],ymm9[6],ymm1[7]
2728 ; AVX-NEXT: vpermilps {{.*#+}} ymm1 = mem[0,0,3,3,4,4,7,7]
2729 ; AVX-NEXT: vpermilpd {{.*#+}} ymm13 = mem[1,0,2,2]
2730 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1],ymm1[2],ymm13[3,4],ymm1[5],ymm13[6,7]
2731 ; AVX-NEXT: vshufpd {{.*#+}} ymm11 = ymm11[1,1,2,2]
2732 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0],ymm1[1,2],ymm11[3],ymm1[4,5],ymm11[6],ymm1[7]
2733 ; AVX-NEXT: vpermilps {{.*#+}} ymm11 = mem[0,0,3,3,4,4,7,7]
2734 ; AVX-NEXT: vpermilpd {{.*#+}} ymm13 = mem[1,0,2,2]
2735 ; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm13[0,1],ymm11[2],ymm13[3,4],ymm11[5],ymm13[6,7]
2736 ; AVX-NEXT: vshufpd {{.*#+}} ymm12 = ymm12[1,1,2,2]
2737 ; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0],ymm11[1,2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7]
2738 ; AVX-NEXT: vpermilps {{.*#+}} ymm12 = mem[0,0,3,3,4,4,7,7]
2739 ; AVX-NEXT: vpermilpd {{.*#+}} ymm13 = mem[1,0,2,2]
2740 ; AVX-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7]
2741 ; AVX-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[1,1,2,2]
2742 ; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm12[1,2],ymm6[3],ymm12[4,5],ymm6[6],ymm12[7]
2743 ; AVX-NEXT: vpermilps {{.*#+}} ymm12 = mem[0,0,3,3,4,4,7,7]
2744 ; AVX-NEXT: vpermilpd {{.*#+}} ymm13 = mem[1,0,2,2]
2745 ; AVX-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7]
2746 ; AVX-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[1,1,2,2]
2747 ; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm12[1,2],ymm4[3],ymm12[4,5],ymm4[6],ymm12[7]
2748 ; AVX-NEXT: vpermilps {{.*#+}} ymm12 = mem[0,0,3,3,4,4,7,7]
2749 ; AVX-NEXT: vpermilpd {{.*#+}} ymm13 = mem[1,0,2,2]
2750 ; AVX-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7]
2751 ; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[1,1,2,2]
2752 ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm12[1,2],ymm2[3],ymm12[4,5],ymm2[6],ymm12[7]
2753 ; AVX-NEXT: vpermilps {{.*#+}} ymm12 = mem[0,0,3,3,4,4,7,7]
2754 ; AVX-NEXT: vpermilpd {{.*#+}} ymm13 = mem[1,0,2,2]
2755 ; AVX-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7]
2756 ; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,1,2,2]
2757 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm12[1,2],ymm0[3],ymm12[4,5],ymm0[6],ymm12[7]
2758 ; AVX-NEXT: vmovaps %ymm0, 704(%rcx)
2759 ; AVX-NEXT: vmovaps %ymm2, 608(%rcx)
2760 ; AVX-NEXT: vmovaps %ymm4, 512(%rcx)
2761 ; AVX-NEXT: vmovaps %ymm6, 416(%rcx)
2762 ; AVX-NEXT: vmovaps %ymm11, 320(%rcx)
2763 ; AVX-NEXT: vmovaps %ymm1, 224(%rcx)
2764 ; AVX-NEXT: vmovaps %ymm9, 128(%rcx)
2765 ; AVX-NEXT: vmovaps %ymm8, 32(%rcx)
2766 ; AVX-NEXT: vmovaps %ymm3, 736(%rcx)
2767 ; AVX-NEXT: vmovaps %ymm5, 672(%rcx)
2768 ; AVX-NEXT: vmovaps %ymm7, 640(%rcx)
2769 ; AVX-NEXT: vmovaps %ymm10, 576(%rcx)
2770 ; AVX-NEXT: vmovaps %ymm15, 544(%rcx)
2771 ; AVX-NEXT: vmovaps %ymm14, 480(%rcx)
2772 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2773 ; AVX-NEXT: vmovaps %ymm0, 448(%rcx)
2774 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2775 ; AVX-NEXT: vmovaps %ymm0, 384(%rcx)
2776 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2777 ; AVX-NEXT: vmovaps %ymm0, 352(%rcx)
2778 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2779 ; AVX-NEXT: vmovaps %ymm0, 288(%rcx)
2780 ; AVX-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
2781 ; AVX-NEXT: vmovaps %ymm0, 256(%rcx)
2782 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2783 ; AVX-NEXT: vmovaps %ymm0, 192(%rcx)
2784 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2785 ; AVX-NEXT: vmovaps %ymm0, 160(%rcx)
2786 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2787 ; AVX-NEXT: vmovaps %ymm0, 96(%rcx)
2788 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2789 ; AVX-NEXT: vmovaps %ymm0, 64(%rcx)
2790 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2791 ; AVX-NEXT: vmovaps %ymm0, (%rcx)
2792 ; AVX-NEXT: addq $200, %rsp
2793 ; AVX-NEXT: vzeroupper
2796 ; AVX2-LABEL: store_i32_stride3_vf64:
2798 ; AVX2-NEXT: subq $712, %rsp # imm = 0x2C8
2799 ; AVX2-NEXT: vmovaps (%rdi), %ymm15
2800 ; AVX2-NEXT: vmovaps 32(%rdi), %ymm8
2801 ; AVX2-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2802 ; AVX2-NEXT: vmovaps 64(%rdi), %ymm13
2803 ; AVX2-NEXT: vmovaps (%rsi), %ymm5
2804 ; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2805 ; AVX2-NEXT: vmovaps 32(%rsi), %ymm3
2806 ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2807 ; AVX2-NEXT: vmovaps 64(%rsi), %ymm2
2808 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2809 ; AVX2-NEXT: vmovaps (%rdx), %ymm7
2810 ; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2811 ; AVX2-NEXT: vmovaps 32(%rdx), %ymm6
2812 ; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2813 ; AVX2-NEXT: vmovaps 64(%rdx), %ymm4
2814 ; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2815 ; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,0,2,2]
2816 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
2817 ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm15[0,0,2,1]
2818 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
2819 ; AVX2-NEXT: vbroadcastsd (%rdx), %ymm1
2820 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
2821 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2822 ; AVX2-NEXT: vbroadcastsd 24(%rdi), %ymm0
2823 ; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,2,3,3,5,6,7,7]
2824 ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3]
2825 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
2826 ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm7[2,1,3,3]
2827 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
2828 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2829 ; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,0,2,2]
2830 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
2831 ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm8[0,0,2,1]
2832 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
2833 ; AVX2-NEXT: vbroadcastsd 32(%rdx), %ymm1
2834 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
2835 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2836 ; AVX2-NEXT: vbroadcastsd 56(%rdi), %ymm0
2837 ; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm3[1,2,3,3,5,6,7,7]
2838 ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3]
2839 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
2840 ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm6[2,1,3,3]
2841 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
2842 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2843 ; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,0,2,2]
2844 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
2845 ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm13[0,0,2,1]
2846 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
2847 ; AVX2-NEXT: vbroadcastsd 64(%rdx), %ymm1
2848 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
2849 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2850 ; AVX2-NEXT: vbroadcastsd 88(%rdi), %ymm0
2851 ; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,2,3,3,5,6,7,7]
2852 ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3]
2853 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
2854 ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm4[2,1,3,3]
2855 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
2856 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2857 ; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,0,2,2]
2858 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
2859 ; AVX2-NEXT: vmovaps 96(%rdi), %ymm1
2860 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2861 ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1]
2862 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
2863 ; AVX2-NEXT: vbroadcastsd 96(%rdx), %ymm1
2864 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
2865 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2866 ; AVX2-NEXT: vmovaps 96(%rsi), %ymm14
2867 ; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm14[1,2,3,3,5,6,7,7]
2868 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,3]
2869 ; AVX2-NEXT: vbroadcastsd 120(%rdi), %ymm1
2870 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
2871 ; AVX2-NEXT: vmovaps 96(%rdx), %ymm1
2872 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2873 ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,3,3]
2874 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
2875 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2876 ; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,0,2,2]
2877 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
2878 ; AVX2-NEXT: vmovaps 128(%rdi), %ymm1
2879 ; AVX2-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill
2880 ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1]
2881 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
2882 ; AVX2-NEXT: vbroadcastsd 128(%rdx), %ymm1
2883 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
2884 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2885 ; AVX2-NEXT: vmovaps 128(%rsi), %ymm12
2886 ; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm12[1,2,3,3,5,6,7,7]
2887 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,3]
2888 ; AVX2-NEXT: vbroadcastsd 152(%rdi), %ymm1
2889 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
2890 ; AVX2-NEXT: vmovaps 128(%rdx), %ymm10
2891 ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm10[2,1,3,3]
2892 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
2893 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2894 ; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,0,2,2]
2895 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
2896 ; AVX2-NEXT: vmovaps 160(%rdi), %ymm9
2897 ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm9[0,0,2,1]
2898 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
2899 ; AVX2-NEXT: vbroadcastsd 160(%rdx), %ymm1
2900 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
2901 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2902 ; AVX2-NEXT: vmovaps 160(%rsi), %ymm7
2903 ; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm7[1,2,3,3,5,6,7,7]
2904 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,3]
2905 ; AVX2-NEXT: vbroadcastsd 184(%rdi), %ymm1
2906 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
2907 ; AVX2-NEXT: vmovaps 160(%rdx), %ymm6
2908 ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm6[2,1,3,3]
2909 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
2910 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2911 ; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,0,2,2]
2912 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
2913 ; AVX2-NEXT: vmovaps 192(%rdi), %ymm5
2914 ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm5[0,0,2,1]
2915 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
2916 ; AVX2-NEXT: vbroadcastsd 192(%rdx), %ymm1
2917 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
2918 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2919 ; AVX2-NEXT: vmovaps 192(%rsi), %ymm4
2920 ; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,2,3,3,5,6,7,7]
2921 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,3]
2922 ; AVX2-NEXT: vbroadcastsd 216(%rdi), %ymm1
2923 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
2924 ; AVX2-NEXT: vmovaps 192(%rdx), %ymm3
2925 ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm3[2,1,3,3]
2926 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
2927 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2928 ; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,0,2,2]
2929 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
2930 ; AVX2-NEXT: vmovaps 224(%rdi), %ymm2
2931 ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm2[0,0,2,1]
2932 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
2933 ; AVX2-NEXT: vbroadcastsd 224(%rdx), %ymm1
2934 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
2935 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2936 ; AVX2-NEXT: vmovaps 224(%rsi), %ymm1
2937 ; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,2,3,3,5,6,7,7]
2938 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,3]
2939 ; AVX2-NEXT: vbroadcastsd 248(%rdi), %ymm8
2940 ; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1],ymm8[2],ymm0[3,4],ymm8[5],ymm0[6,7]
2941 ; AVX2-NEXT: vmovaps 224(%rdx), %ymm0
2942 ; AVX2-NEXT: vpermpd {{.*#+}} ymm11 = ymm0[2,1,3,3]
2943 ; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm11[1],ymm8[2,3],ymm11[4],ymm8[5,6],ymm11[7]
2944 ; AVX2-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2945 ; AVX2-NEXT: vpermilps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
2946 ; AVX2-NEXT: # ymm8 = mem[0,0,3,3,4,4,7,7]
2947 ; AVX2-NEXT: vpermpd {{.*#+}} ymm11 = ymm15[1,1,2,2]
2948 ; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1],ymm8[2],ymm11[3,4],ymm8[5],ymm11[6,7]
2949 ; AVX2-NEXT: vpermpd $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
2950 ; AVX2-NEXT: # ymm11 = mem[1,1,2,2]
2951 ; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm8[1,2],ymm11[3],ymm8[4,5],ymm11[6],ymm8[7]
2952 ; AVX2-NEXT: vpermilps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
2953 ; AVX2-NEXT: # ymm8 = mem[0,0,3,3,4,4,7,7]
2954 ; AVX2-NEXT: vpermpd $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
2955 ; AVX2-NEXT: # ymm15 = mem[1,1,2,2]
2956 ; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1],ymm8[2],ymm15[3,4],ymm8[5],ymm15[6,7]
2957 ; AVX2-NEXT: vpermpd $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
2958 ; AVX2-NEXT: # ymm15 = mem[1,1,2,2]
2959 ; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0],ymm8[1,2],ymm15[3],ymm8[4,5],ymm15[6],ymm8[7]
2960 ; AVX2-NEXT: vpermilps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
2961 ; AVX2-NEXT: # ymm15 = mem[0,0,3,3,4,4,7,7]
2962 ; AVX2-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[1,1,2,2]
2963 ; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2],ymm13[3,4],ymm15[5],ymm13[6,7]
2964 ; AVX2-NEXT: vpermpd $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
2965 ; AVX2-NEXT: # ymm15 = mem[1,1,2,2]
2966 ; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0],ymm13[1,2],ymm15[3],ymm13[4,5],ymm15[6],ymm13[7]
2967 ; AVX2-NEXT: vshufps {{.*#+}} ymm15 = ymm14[0,0,3,3,4,4,7,7]
2968 ; AVX2-NEXT: vpermpd $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
2969 ; AVX2-NEXT: # ymm14 = mem[1,1,2,2]
2970 ; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7]
2971 ; AVX2-NEXT: vpermpd $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
2972 ; AVX2-NEXT: # ymm15 = mem[1,1,2,2]
2973 ; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0],ymm14[1,2],ymm15[3],ymm14[4,5],ymm15[6],ymm14[7]
2974 ; AVX2-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,0,3,3,4,4,7,7]
2975 ; AVX2-NEXT: vpermpd $165, (%rsp), %ymm15 # 32-byte Folded Reload
2976 ; AVX2-NEXT: # ymm15 = mem[1,1,2,2]
2977 ; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm15[0,1],ymm12[2],ymm15[3,4],ymm12[5],ymm15[6,7]
2978 ; AVX2-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[1,1,2,2]
2979 ; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm12[1,2],ymm10[3],ymm12[4,5],ymm10[6],ymm12[7]
2980 ; AVX2-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,0,3,3,4,4,7,7]
2981 ; AVX2-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[1,1,2,2]
2982 ; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1],ymm7[2],ymm9[3,4],ymm7[5],ymm9[6,7]
2983 ; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[1,1,2,2]
2984 ; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1,2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7]
2985 ; AVX2-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,0,3,3,4,4,7,7]
2986 ; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[1,1,2,2]
2987 ; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7]
2988 ; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[1,1,2,2]
2989 ; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7]
2990 ; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7]
2991 ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[1,1,2,2]
2992 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
2993 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2]
2994 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7]
2995 ; AVX2-NEXT: vmovaps %ymm0, 704(%rcx)
2996 ; AVX2-NEXT: vmovaps %ymm3, 608(%rcx)
2997 ; AVX2-NEXT: vmovaps %ymm6, 512(%rcx)
2998 ; AVX2-NEXT: vmovaps %ymm10, 416(%rcx)
2999 ; AVX2-NEXT: vmovaps %ymm14, 320(%rcx)
3000 ; AVX2-NEXT: vmovaps %ymm13, 224(%rcx)
3001 ; AVX2-NEXT: vmovaps %ymm8, 128(%rcx)
3002 ; AVX2-NEXT: vmovaps %ymm11, 32(%rcx)
3003 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3004 ; AVX2-NEXT: vmovaps %ymm0, 736(%rcx)
3005 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3006 ; AVX2-NEXT: vmovaps %ymm0, 672(%rcx)
3007 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3008 ; AVX2-NEXT: vmovaps %ymm0, 640(%rcx)
3009 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3010 ; AVX2-NEXT: vmovaps %ymm0, 576(%rcx)
3011 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3012 ; AVX2-NEXT: vmovaps %ymm0, 544(%rcx)
3013 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3014 ; AVX2-NEXT: vmovaps %ymm0, 480(%rcx)
3015 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3016 ; AVX2-NEXT: vmovaps %ymm0, 448(%rcx)
3017 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3018 ; AVX2-NEXT: vmovaps %ymm0, 384(%rcx)
3019 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3020 ; AVX2-NEXT: vmovaps %ymm0, 352(%rcx)
3021 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3022 ; AVX2-NEXT: vmovaps %ymm0, 288(%rcx)
3023 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3024 ; AVX2-NEXT: vmovaps %ymm0, 256(%rcx)
3025 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3026 ; AVX2-NEXT: vmovaps %ymm0, 192(%rcx)
3027 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3028 ; AVX2-NEXT: vmovaps %ymm0, 160(%rcx)
3029 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3030 ; AVX2-NEXT: vmovaps %ymm0, 96(%rcx)
3031 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3032 ; AVX2-NEXT: vmovaps %ymm0, 64(%rcx)
3033 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3034 ; AVX2-NEXT: vmovaps %ymm0, (%rcx)
3035 ; AVX2-NEXT: addq $712, %rsp # imm = 0x2C8
3036 ; AVX2-NEXT: vzeroupper
3039 ; AVX2-FP-LABEL: store_i32_stride3_vf64:
3041 ; AVX2-FP-NEXT: subq $712, %rsp # imm = 0x2C8
3042 ; AVX2-FP-NEXT: vmovaps (%rdi), %ymm15
3043 ; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm8
3044 ; AVX2-FP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3045 ; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm13
3046 ; AVX2-FP-NEXT: vmovaps (%rsi), %ymm5
3047 ; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3048 ; AVX2-FP-NEXT: vmovaps 32(%rsi), %ymm3
3049 ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3050 ; AVX2-FP-NEXT: vmovaps 64(%rsi), %ymm2
3051 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3052 ; AVX2-FP-NEXT: vmovaps (%rdx), %ymm7
3053 ; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3054 ; AVX2-FP-NEXT: vmovaps 32(%rdx), %ymm6
3055 ; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3056 ; AVX2-FP-NEXT: vmovaps 64(%rdx), %ymm4
3057 ; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3058 ; AVX2-FP-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,0,2,2]
3059 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
3060 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm15[0,0,2,1]
3061 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
3062 ; AVX2-FP-NEXT: vbroadcastsd (%rdx), %ymm1
3063 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
3064 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3065 ; AVX2-FP-NEXT: vbroadcastsd 24(%rdi), %ymm0
3066 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,2,3,3,5,6,7,7]
3067 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3]
3068 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
3069 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm7[2,1,3,3]
3070 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
3071 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3072 ; AVX2-FP-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,0,2,2]
3073 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
3074 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm8[0,0,2,1]
3075 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
3076 ; AVX2-FP-NEXT: vbroadcastsd 32(%rdx), %ymm1
3077 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
3078 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3079 ; AVX2-FP-NEXT: vbroadcastsd 56(%rdi), %ymm0
3080 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm3[1,2,3,3,5,6,7,7]
3081 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3]
3082 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
3083 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm6[2,1,3,3]
3084 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
3085 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3086 ; AVX2-FP-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,0,2,2]
3087 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
3088 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm13[0,0,2,1]
3089 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
3090 ; AVX2-FP-NEXT: vbroadcastsd 64(%rdx), %ymm1
3091 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
3092 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3093 ; AVX2-FP-NEXT: vbroadcastsd 88(%rdi), %ymm0
3094 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,2,3,3,5,6,7,7]
3095 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3]
3096 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
3097 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm4[2,1,3,3]
3098 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
3099 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3100 ; AVX2-FP-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,0,2,2]
3101 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
3102 ; AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm1
3103 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3104 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1]
3105 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
3106 ; AVX2-FP-NEXT: vbroadcastsd 96(%rdx), %ymm1
3107 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
3108 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3109 ; AVX2-FP-NEXT: vmovaps 96(%rsi), %ymm14
3110 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm14[1,2,3,3,5,6,7,7]
3111 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,3]
3112 ; AVX2-FP-NEXT: vbroadcastsd 120(%rdi), %ymm1
3113 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
3114 ; AVX2-FP-NEXT: vmovaps 96(%rdx), %ymm1
3115 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3116 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,3,3]
3117 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
3118 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3119 ; AVX2-FP-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,0,2,2]
3120 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
3121 ; AVX2-FP-NEXT: vmovaps 128(%rdi), %ymm1
3122 ; AVX2-FP-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill
3123 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1]
3124 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
3125 ; AVX2-FP-NEXT: vbroadcastsd 128(%rdx), %ymm1
3126 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
3127 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3128 ; AVX2-FP-NEXT: vmovaps 128(%rsi), %ymm12
3129 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm12[1,2,3,3,5,6,7,7]
3130 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,3]
3131 ; AVX2-FP-NEXT: vbroadcastsd 152(%rdi), %ymm1
3132 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
3133 ; AVX2-FP-NEXT: vmovaps 128(%rdx), %ymm10
3134 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm10[2,1,3,3]
3135 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
3136 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3137 ; AVX2-FP-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,0,2,2]
3138 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
3139 ; AVX2-FP-NEXT: vmovaps 160(%rdi), %ymm9
3140 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm9[0,0,2,1]
3141 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
3142 ; AVX2-FP-NEXT: vbroadcastsd 160(%rdx), %ymm1
3143 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
3144 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3145 ; AVX2-FP-NEXT: vmovaps 160(%rsi), %ymm7
3146 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm7[1,2,3,3,5,6,7,7]
3147 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,3]
3148 ; AVX2-FP-NEXT: vbroadcastsd 184(%rdi), %ymm1
3149 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
3150 ; AVX2-FP-NEXT: vmovaps 160(%rdx), %ymm6
3151 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm6[2,1,3,3]
3152 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
3153 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3154 ; AVX2-FP-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,0,2,2]
3155 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
3156 ; AVX2-FP-NEXT: vmovaps 192(%rdi), %ymm5
3157 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm5[0,0,2,1]
3158 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
3159 ; AVX2-FP-NEXT: vbroadcastsd 192(%rdx), %ymm1
3160 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
3161 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3162 ; AVX2-FP-NEXT: vmovaps 192(%rsi), %ymm4
3163 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,2,3,3,5,6,7,7]
3164 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,3]
3165 ; AVX2-FP-NEXT: vbroadcastsd 216(%rdi), %ymm1
3166 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
3167 ; AVX2-FP-NEXT: vmovaps 192(%rdx), %ymm3
3168 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm3[2,1,3,3]
3169 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
3170 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3171 ; AVX2-FP-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,0,2,2]
3172 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
3173 ; AVX2-FP-NEXT: vmovaps 224(%rdi), %ymm2
3174 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm2[0,0,2,1]
3175 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
3176 ; AVX2-FP-NEXT: vbroadcastsd 224(%rdx), %ymm1
3177 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
3178 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3179 ; AVX2-FP-NEXT: vmovaps 224(%rsi), %ymm1
3180 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,2,3,3,5,6,7,7]
3181 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,3]
3182 ; AVX2-FP-NEXT: vbroadcastsd 248(%rdi), %ymm8
3183 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1],ymm8[2],ymm0[3,4],ymm8[5],ymm0[6,7]
3184 ; AVX2-FP-NEXT: vmovaps 224(%rdx), %ymm0
3185 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm11 = ymm0[2,1,3,3]
3186 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm11[1],ymm8[2,3],ymm11[4],ymm8[5,6],ymm11[7]
3187 ; AVX2-FP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3188 ; AVX2-FP-NEXT: vpermilps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
3189 ; AVX2-FP-NEXT: # ymm8 = mem[0,0,3,3,4,4,7,7]
3190 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm11 = ymm15[1,1,2,2]
3191 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1],ymm8[2],ymm11[3,4],ymm8[5],ymm11[6,7]
3192 ; AVX2-FP-NEXT: vpermpd $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
3193 ; AVX2-FP-NEXT: # ymm11 = mem[1,1,2,2]
3194 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm8[1,2],ymm11[3],ymm8[4,5],ymm11[6],ymm8[7]
3195 ; AVX2-FP-NEXT: vpermilps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
3196 ; AVX2-FP-NEXT: # ymm8 = mem[0,0,3,3,4,4,7,7]
3197 ; AVX2-FP-NEXT: vpermpd $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
3198 ; AVX2-FP-NEXT: # ymm15 = mem[1,1,2,2]
3199 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1],ymm8[2],ymm15[3,4],ymm8[5],ymm15[6,7]
3200 ; AVX2-FP-NEXT: vpermpd $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
3201 ; AVX2-FP-NEXT: # ymm15 = mem[1,1,2,2]
3202 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0],ymm8[1,2],ymm15[3],ymm8[4,5],ymm15[6],ymm8[7]
3203 ; AVX2-FP-NEXT: vpermilps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
3204 ; AVX2-FP-NEXT: # ymm15 = mem[0,0,3,3,4,4,7,7]
3205 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[1,1,2,2]
3206 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2],ymm13[3,4],ymm15[5],ymm13[6,7]
3207 ; AVX2-FP-NEXT: vpermpd $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
3208 ; AVX2-FP-NEXT: # ymm15 = mem[1,1,2,2]
3209 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0],ymm13[1,2],ymm15[3],ymm13[4,5],ymm15[6],ymm13[7]
3210 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm15 = ymm14[0,0,3,3,4,4,7,7]
3211 ; AVX2-FP-NEXT: vpermpd $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
3212 ; AVX2-FP-NEXT: # ymm14 = mem[1,1,2,2]
3213 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7]
3214 ; AVX2-FP-NEXT: vpermpd $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
3215 ; AVX2-FP-NEXT: # ymm15 = mem[1,1,2,2]
3216 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0],ymm14[1,2],ymm15[3],ymm14[4,5],ymm15[6],ymm14[7]
3217 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,0,3,3,4,4,7,7]
3218 ; AVX2-FP-NEXT: vpermpd $165, (%rsp), %ymm15 # 32-byte Folded Reload
3219 ; AVX2-FP-NEXT: # ymm15 = mem[1,1,2,2]
3220 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm15[0,1],ymm12[2],ymm15[3,4],ymm12[5],ymm15[6,7]
3221 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[1,1,2,2]
3222 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm12[1,2],ymm10[3],ymm12[4,5],ymm10[6],ymm12[7]
3223 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,0,3,3,4,4,7,7]
3224 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[1,1,2,2]
3225 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1],ymm7[2],ymm9[3,4],ymm7[5],ymm9[6,7]
3226 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[1,1,2,2]
3227 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1,2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7]
3228 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,0,3,3,4,4,7,7]
3229 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[1,1,2,2]
3230 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7]
3231 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[1,1,2,2]
3232 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7]
3233 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7]
3234 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[1,1,2,2]
3235 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
3236 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2]
3237 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7]
3238 ; AVX2-FP-NEXT: vmovaps %ymm0, 704(%rcx)
3239 ; AVX2-FP-NEXT: vmovaps %ymm3, 608(%rcx)
3240 ; AVX2-FP-NEXT: vmovaps %ymm6, 512(%rcx)
3241 ; AVX2-FP-NEXT: vmovaps %ymm10, 416(%rcx)
3242 ; AVX2-FP-NEXT: vmovaps %ymm14, 320(%rcx)
3243 ; AVX2-FP-NEXT: vmovaps %ymm13, 224(%rcx)
3244 ; AVX2-FP-NEXT: vmovaps %ymm8, 128(%rcx)
3245 ; AVX2-FP-NEXT: vmovaps %ymm11, 32(%rcx)
3246 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3247 ; AVX2-FP-NEXT: vmovaps %ymm0, 736(%rcx)
3248 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3249 ; AVX2-FP-NEXT: vmovaps %ymm0, 672(%rcx)
3250 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3251 ; AVX2-FP-NEXT: vmovaps %ymm0, 640(%rcx)
3252 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3253 ; AVX2-FP-NEXT: vmovaps %ymm0, 576(%rcx)
3254 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3255 ; AVX2-FP-NEXT: vmovaps %ymm0, 544(%rcx)
3256 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3257 ; AVX2-FP-NEXT: vmovaps %ymm0, 480(%rcx)
3258 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3259 ; AVX2-FP-NEXT: vmovaps %ymm0, 448(%rcx)
3260 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3261 ; AVX2-FP-NEXT: vmovaps %ymm0, 384(%rcx)
3262 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3263 ; AVX2-FP-NEXT: vmovaps %ymm0, 352(%rcx)
3264 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3265 ; AVX2-FP-NEXT: vmovaps %ymm0, 288(%rcx)
3266 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3267 ; AVX2-FP-NEXT: vmovaps %ymm0, 256(%rcx)
3268 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3269 ; AVX2-FP-NEXT: vmovaps %ymm0, 192(%rcx)
3270 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3271 ; AVX2-FP-NEXT: vmovaps %ymm0, 160(%rcx)
3272 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3273 ; AVX2-FP-NEXT: vmovaps %ymm0, 96(%rcx)
3274 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3275 ; AVX2-FP-NEXT: vmovaps %ymm0, 64(%rcx)
3276 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3277 ; AVX2-FP-NEXT: vmovaps %ymm0, (%rcx)
3278 ; AVX2-FP-NEXT: addq $712, %rsp # imm = 0x2C8
3279 ; AVX2-FP-NEXT: vzeroupper
3280 ; AVX2-FP-NEXT: retq
3282 ; AVX2-FCP-LABEL: store_i32_stride3_vf64:
3283 ; AVX2-FCP: # %bb.0:
3284 ; AVX2-FCP-NEXT: subq $232, %rsp
3285 ; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm6
3286 ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm3
3287 ; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm1
3288 ; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm12
3289 ; AVX2-FCP-NEXT: vmovaps (%rsi), %ymm8
3290 ; AVX2-FCP-NEXT: vmovaps 32(%rsi), %ymm5
3291 ; AVX2-FCP-NEXT: vmovaps 64(%rsi), %ymm10
3292 ; AVX2-FCP-NEXT: vmovaps 96(%rsi), %ymm0
3293 ; AVX2-FCP-NEXT: vmovaps (%rdx), %ymm9
3294 ; AVX2-FCP-NEXT: vmovaps 32(%rdx), %ymm4
3295 ; AVX2-FCP-NEXT: vmovaps 64(%rdx), %ymm2
3296 ; AVX2-FCP-NEXT: vmovaps 96(%rdx), %ymm13
3297 ; AVX2-FCP-NEXT: vbroadcastf128 {{.*#+}} ymm7 = [1,0,2,2,1,0,2,2]
3298 ; AVX2-FCP-NEXT: # ymm7 = mem[0,1,0,1]
3299 ; AVX2-FCP-NEXT: vpermps %ymm8, %ymm7, %ymm11
3300 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm14 = ymm6[0,0,2,1]
3301 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3],ymm11[4],ymm14[5,6],ymm11[7]
3302 ; AVX2-FCP-NEXT: vbroadcastsd (%rdx), %ymm14
3303 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm14[2],ymm11[3,4],ymm14[5],ymm11[6,7]
3304 ; AVX2-FCP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3305 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm11 = ymm8[0,0,3,3,4,4,7,7]
3306 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[1,1,2,2]
3307 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm11[2],ymm6[3,4],ymm11[5],ymm6[6,7]
3308 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm11 = ymm9[1,1,2,2]
3309 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm11[0],ymm6[1,2],ymm11[3],ymm6[4,5],ymm11[6],ymm6[7]
3310 ; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3311 ; AVX2-FCP-NEXT: vmovaps {{.*#+}} ymm6 = [5,6,5,6,5,6,7,7]
3312 ; AVX2-FCP-NEXT: vpermps %ymm8, %ymm6, %ymm8
3313 ; AVX2-FCP-NEXT: vbroadcastsd 24(%rdi), %ymm11
3314 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm11[2],ymm8[3,4],ymm11[5],ymm8[6,7]
3315 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,1,3,3]
3316 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5,6],ymm9[7]
3317 ; AVX2-FCP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3318 ; AVX2-FCP-NEXT: vpermps %ymm5, %ymm7, %ymm8
3319 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm9 = ymm3[0,0,2,1]
3320 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5,6],ymm8[7]
3321 ; AVX2-FCP-NEXT: vbroadcastsd 32(%rdx), %ymm9
3322 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7]
3323 ; AVX2-FCP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3324 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm8 = ymm5[0,0,3,3,4,4,7,7]
3325 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[1,1,2,2]
3326 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm8[2],ymm3[3,4],ymm8[5],ymm3[6,7]
3327 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm8 = ymm4[1,1,2,2]
3328 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0],ymm3[1,2],ymm8[3],ymm3[4,5],ymm8[6],ymm3[7]
3329 ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3330 ; AVX2-FCP-NEXT: vpermps %ymm5, %ymm6, %ymm3
3331 ; AVX2-FCP-NEXT: vbroadcastsd 56(%rdi), %ymm5
3332 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7]
3333 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,1,3,3]
3334 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7]
3335 ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3336 ; AVX2-FCP-NEXT: vpermps %ymm10, %ymm7, %ymm3
3337 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm4 = ymm1[0,0,2,1]
3338 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7]
3339 ; AVX2-FCP-NEXT: vbroadcastsd 64(%rdx), %ymm4
3340 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
3341 ; AVX2-FCP-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill
3342 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm3 = ymm10[0,0,3,3,4,4,7,7]
3343 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[1,1,2,2]
3344 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7]
3345 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm2[1,1,2,2]
3346 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0],ymm1[1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7]
3347 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3348 ; AVX2-FCP-NEXT: vpermps %ymm10, %ymm6, %ymm1
3349 ; AVX2-FCP-NEXT: vbroadcastsd 88(%rdi), %ymm3
3350 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7]
3351 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,3,3]
3352 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7]
3353 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3354 ; AVX2-FCP-NEXT: vpermps %ymm0, %ymm7, %ymm1
3355 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm12[0,0,2,1]
3356 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7]
3357 ; AVX2-FCP-NEXT: vbroadcastsd 96(%rdx), %ymm2
3358 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
3359 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3360 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm0[0,0,3,3,4,4,7,7]
3361 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm12[1,1,2,2]
3362 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
3363 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm13[1,1,2,2]
3364 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7]
3365 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3366 ; AVX2-FCP-NEXT: vpermps %ymm0, %ymm6, %ymm0
3367 ; AVX2-FCP-NEXT: vbroadcastsd 120(%rdi), %ymm1
3368 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
3369 ; AVX2-FCP-NEXT: vmovaps 128(%rdi), %ymm1
3370 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm13[2,1,3,3]
3371 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7]
3372 ; AVX2-FCP-NEXT: vmovaps 128(%rsi), %ymm0
3373 ; AVX2-FCP-NEXT: vpermps %ymm0, %ymm7, %ymm2
3374 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm1[0,0,2,1]
3375 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7]
3376 ; AVX2-FCP-NEXT: vbroadcastsd 128(%rdx), %ymm3
3377 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
3378 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[1,1,2,2]
3379 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,0,3,3,4,4,7,7]
3380 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
3381 ; AVX2-FCP-NEXT: vmovaps 128(%rdx), %ymm2
3382 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm2[1,1,2,2]
3383 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm15 = ymm3[0],ymm1[1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7]
3384 ; AVX2-FCP-NEXT: vpermps %ymm0, %ymm6, %ymm0
3385 ; AVX2-FCP-NEXT: vbroadcastsd 152(%rdi), %ymm1
3386 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
3387 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm2[2,1,3,3]
3388 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
3389 ; AVX2-FCP-NEXT: vmovaps 160(%rdi), %ymm0
3390 ; AVX2-FCP-NEXT: vmovaps 160(%rsi), %ymm1
3391 ; AVX2-FCP-NEXT: vpermps %ymm1, %ymm7, %ymm2
3392 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[0,0,2,1]
3393 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7]
3394 ; AVX2-FCP-NEXT: vbroadcastsd 160(%rdx), %ymm3
3395 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
3396 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2]
3397 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm1[0,0,3,3,4,4,7,7]
3398 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7]
3399 ; AVX2-FCP-NEXT: vmovaps 160(%rdx), %ymm0
3400 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm5 = ymm0[1,1,2,2]
3401 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0],ymm2[1,2],ymm5[3],ymm2[4,5],ymm5[6],ymm2[7]
3402 ; AVX2-FCP-NEXT: vpermps %ymm1, %ymm6, %ymm1
3403 ; AVX2-FCP-NEXT: vbroadcastsd 184(%rdi), %ymm5
3404 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm5[2],ymm1[3,4],ymm5[5],ymm1[6,7]
3405 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3]
3406 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
3407 ; AVX2-FCP-NEXT: vmovaps 192(%rdi), %ymm5
3408 ; AVX2-FCP-NEXT: vmovaps 192(%rsi), %ymm0
3409 ; AVX2-FCP-NEXT: vpermps %ymm0, %ymm7, %ymm1
3410 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm9 = ymm5[0,0,2,1]
3411 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0],ymm1[1],ymm9[2,3],ymm1[4],ymm9[5,6],ymm1[7]
3412 ; AVX2-FCP-NEXT: vbroadcastsd 192(%rdx), %ymm9
3413 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm9[2],ymm1[3,4],ymm9[5],ymm1[6,7]
3414 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[1,1,2,2]
3415 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm9 = ymm0[0,0,3,3,4,4,7,7]
3416 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm9[2],ymm5[3,4],ymm9[5],ymm5[6,7]
3417 ; AVX2-FCP-NEXT: vmovaps 192(%rdx), %ymm9
3418 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm10 = ymm9[1,1,2,2]
3419 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0],ymm5[1,2],ymm10[3],ymm5[4,5],ymm10[6],ymm5[7]
3420 ; AVX2-FCP-NEXT: vpermps %ymm0, %ymm6, %ymm0
3421 ; AVX2-FCP-NEXT: vbroadcastsd 216(%rdi), %ymm10
3422 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm10[2],ymm0[3,4],ymm10[5],ymm0[6,7]
3423 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,1,3,3]
3424 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm9[1],ymm0[2,3],ymm9[4],ymm0[5,6],ymm9[7]
3425 ; AVX2-FCP-NEXT: vmovaps 224(%rsi), %ymm9
3426 ; AVX2-FCP-NEXT: vpermps %ymm9, %ymm7, %ymm7
3427 ; AVX2-FCP-NEXT: vmovaps 224(%rdi), %ymm10
3428 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm11 = ymm10[0,0,2,1]
3429 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0],ymm7[1],ymm11[2,3],ymm7[4],ymm11[5,6],ymm7[7]
3430 ; AVX2-FCP-NEXT: vbroadcastsd 224(%rdx), %ymm11
3431 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm11[2],ymm7[3,4],ymm11[5],ymm7[6,7]
3432 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[1,1,2,2]
3433 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm11 = ymm9[0,0,3,3,4,4,7,7]
3434 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7]
3435 ; AVX2-FCP-NEXT: vmovaps 224(%rdx), %ymm11
3436 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm12 = ymm11[1,1,2,2]
3437 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0],ymm10[1,2],ymm12[3],ymm10[4,5],ymm12[6],ymm10[7]
3438 ; AVX2-FCP-NEXT: vpermps %ymm9, %ymm6, %ymm6
3439 ; AVX2-FCP-NEXT: vbroadcastsd 248(%rdi), %ymm9
3440 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm9[2],ymm6[3,4],ymm9[5],ymm6[6,7]
3441 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm9 = ymm11[2,1,3,3]
3442 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm9[1],ymm6[2,3],ymm9[4],ymm6[5,6],ymm9[7]
3443 ; AVX2-FCP-NEXT: vmovaps %ymm6, 736(%rcx)
3444 ; AVX2-FCP-NEXT: vmovaps %ymm10, 704(%rcx)
3445 ; AVX2-FCP-NEXT: vmovaps %ymm7, 672(%rcx)
3446 ; AVX2-FCP-NEXT: vmovaps %ymm0, 640(%rcx)
3447 ; AVX2-FCP-NEXT: vmovaps %ymm5, 608(%rcx)
3448 ; AVX2-FCP-NEXT: vmovaps %ymm1, 576(%rcx)
3449 ; AVX2-FCP-NEXT: vmovaps %ymm2, 544(%rcx)
3450 ; AVX2-FCP-NEXT: vmovaps %ymm3, 512(%rcx)
3451 ; AVX2-FCP-NEXT: vmovaps %ymm4, 480(%rcx)
3452 ; AVX2-FCP-NEXT: vmovaps %ymm8, 448(%rcx)
3453 ; AVX2-FCP-NEXT: vmovaps %ymm15, 416(%rcx)
3454 ; AVX2-FCP-NEXT: vmovaps %ymm14, 384(%rcx)
3455 ; AVX2-FCP-NEXT: vmovaps %ymm13, 352(%rcx)
3456 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3457 ; AVX2-FCP-NEXT: vmovaps %ymm0, 320(%rcx)
3458 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3459 ; AVX2-FCP-NEXT: vmovaps %ymm0, 288(%rcx)
3460 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3461 ; AVX2-FCP-NEXT: vmovaps %ymm0, 256(%rcx)
3462 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3463 ; AVX2-FCP-NEXT: vmovaps %ymm0, 224(%rcx)
3464 ; AVX2-FCP-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
3465 ; AVX2-FCP-NEXT: vmovaps %ymm0, 192(%rcx)
3466 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3467 ; AVX2-FCP-NEXT: vmovaps %ymm0, 160(%rcx)
3468 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3469 ; AVX2-FCP-NEXT: vmovaps %ymm0, 128(%rcx)
3470 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3471 ; AVX2-FCP-NEXT: vmovaps %ymm0, 96(%rcx)
3472 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3473 ; AVX2-FCP-NEXT: vmovaps %ymm0, 64(%rcx)
3474 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3475 ; AVX2-FCP-NEXT: vmovaps %ymm0, 32(%rcx)
3476 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3477 ; AVX2-FCP-NEXT: vmovaps %ymm0, (%rcx)
3478 ; AVX2-FCP-NEXT: addq $232, %rsp
3479 ; AVX2-FCP-NEXT: vzeroupper
3480 ; AVX2-FCP-NEXT: retq
3482 ; AVX512-LABEL: store_i32_stride3_vf64:
3484 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm3
3485 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm2
3486 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm1
3487 ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm0
3488 ; AVX512-NEXT: vmovdqa64 (%rsi), %zmm4
3489 ; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm5
3490 ; AVX512-NEXT: vmovdqa64 128(%rsi), %zmm6
3491 ; AVX512-NEXT: vmovdqa64 192(%rsi), %zmm7
3492 ; AVX512-NEXT: vmovdqa64 (%rdx), %zmm8
3493 ; AVX512-NEXT: vmovdqa64 64(%rdx), %zmm9
3494 ; AVX512-NEXT: vmovdqa64 128(%rdx), %zmm10
3495 ; AVX512-NEXT: vmovdqa64 192(%rdx), %zmm11
3496 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5]
3497 ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm13
3498 ; AVX512-NEXT: vpermt2d %zmm4, %zmm12, %zmm13
3499 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15]
3500 ; AVX512-NEXT: vpermt2d %zmm8, %zmm14, %zmm13
3501 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0]
3502 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm16
3503 ; AVX512-NEXT: vpermt2d %zmm7, %zmm15, %zmm16
3504 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm17 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31]
3505 ; AVX512-NEXT: vpermt2d %zmm11, %zmm17, %zmm16
3506 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm18 = [5,0,22,6,0,23,7,0,24,8,0,25,9,0,26,10]
3507 ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm19
3508 ; AVX512-NEXT: vpermt2d %zmm0, %zmm18, %zmm19
3509 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15]
3510 ; AVX512-NEXT: vpermt2d %zmm11, %zmm20, %zmm19
3511 ; AVX512-NEXT: vpermt2d %zmm7, %zmm12, %zmm0
3512 ; AVX512-NEXT: vpermt2d %zmm11, %zmm14, %zmm0
3513 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm7
3514 ; AVX512-NEXT: vpermt2d %zmm6, %zmm15, %zmm7
3515 ; AVX512-NEXT: vpermt2d %zmm10, %zmm17, %zmm7
3516 ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm11
3517 ; AVX512-NEXT: vpermt2d %zmm1, %zmm18, %zmm11
3518 ; AVX512-NEXT: vpermt2d %zmm10, %zmm20, %zmm11
3519 ; AVX512-NEXT: vpermt2d %zmm6, %zmm12, %zmm1
3520 ; AVX512-NEXT: vpermt2d %zmm10, %zmm14, %zmm1
3521 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm6
3522 ; AVX512-NEXT: vpermt2d %zmm5, %zmm15, %zmm6
3523 ; AVX512-NEXT: vpermt2d %zmm9, %zmm17, %zmm6
3524 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm10
3525 ; AVX512-NEXT: vpermt2d %zmm2, %zmm18, %zmm10
3526 ; AVX512-NEXT: vpermt2d %zmm9, %zmm20, %zmm10
3527 ; AVX512-NEXT: vpermt2d %zmm5, %zmm12, %zmm2
3528 ; AVX512-NEXT: vpermt2d %zmm9, %zmm14, %zmm2
3529 ; AVX512-NEXT: vpermi2d %zmm4, %zmm3, %zmm15
3530 ; AVX512-NEXT: vpermt2d %zmm8, %zmm17, %zmm15
3531 ; AVX512-NEXT: vpermt2d %zmm3, %zmm18, %zmm4
3532 ; AVX512-NEXT: vpermt2d %zmm8, %zmm20, %zmm4
3533 ; AVX512-NEXT: vmovdqa64 %zmm4, 64(%rcx)
3534 ; AVX512-NEXT: vmovdqa64 %zmm15, 128(%rcx)
3535 ; AVX512-NEXT: vmovdqa64 %zmm2, 192(%rcx)
3536 ; AVX512-NEXT: vmovdqa64 %zmm10, 256(%rcx)
3537 ; AVX512-NEXT: vmovdqa64 %zmm6, 320(%rcx)
3538 ; AVX512-NEXT: vmovdqa64 %zmm1, 384(%rcx)
3539 ; AVX512-NEXT: vmovdqa64 %zmm11, 448(%rcx)
3540 ; AVX512-NEXT: vmovdqa64 %zmm7, 512(%rcx)
3541 ; AVX512-NEXT: vmovdqa64 %zmm0, 576(%rcx)
3542 ; AVX512-NEXT: vmovdqa64 %zmm19, 640(%rcx)
3543 ; AVX512-NEXT: vmovdqa64 %zmm16, 704(%rcx)
3544 ; AVX512-NEXT: vmovdqa64 %zmm13, (%rcx)
3545 ; AVX512-NEXT: vzeroupper
3548 ; AVX512-FCP-LABEL: store_i32_stride3_vf64:
3549 ; AVX512-FCP: # %bb.0:
3550 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm3
3551 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2
3552 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1
3553 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm0
3554 ; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm4
3555 ; AVX512-FCP-NEXT: vmovdqa64 64(%rsi), %zmm5
3556 ; AVX512-FCP-NEXT: vmovdqa64 128(%rsi), %zmm6
3557 ; AVX512-FCP-NEXT: vmovdqa64 192(%rsi), %zmm7
3558 ; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm8
3559 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdx), %zmm9
3560 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdx), %zmm10
3561 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdx), %zmm11
3562 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5]
3563 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm13
3564 ; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm12, %zmm13
3565 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15]
3566 ; AVX512-FCP-NEXT: vpermt2d %zmm8, %zmm14, %zmm13
3567 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0]
3568 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm16
3569 ; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm15, %zmm16
3570 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31]
3571 ; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm17, %zmm16
3572 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [5,0,22,6,0,23,7,0,24,8,0,25,9,0,26,10]
3573 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm19
3574 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm19
3575 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15]
3576 ; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm20, %zmm19
3577 ; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm12, %zmm0
3578 ; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm14, %zmm0
3579 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm7
3580 ; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm15, %zmm7
3581 ; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm17, %zmm7
3582 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm11
3583 ; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm18, %zmm11
3584 ; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm20, %zmm11
3585 ; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm12, %zmm1
3586 ; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm14, %zmm1
3587 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm6
3588 ; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm15, %zmm6
3589 ; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm17, %zmm6
3590 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm10
3591 ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm18, %zmm10
3592 ; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm20, %zmm10
3593 ; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm12, %zmm2
3594 ; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm14, %zmm2
3595 ; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm15
3596 ; AVX512-FCP-NEXT: vpermt2d %zmm8, %zmm17, %zmm15
3597 ; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm18, %zmm4
3598 ; AVX512-FCP-NEXT: vpermt2d %zmm8, %zmm20, %zmm4
3599 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 64(%rcx)
3600 ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, 128(%rcx)
3601 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 192(%rcx)
3602 ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, 256(%rcx)
3603 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 320(%rcx)
3604 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 384(%rcx)
3605 ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, 448(%rcx)
3606 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 512(%rcx)
3607 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 576(%rcx)
3608 ; AVX512-FCP-NEXT: vmovdqa64 %zmm19, 640(%rcx)
3609 ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, 704(%rcx)
3610 ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, (%rcx)
3611 ; AVX512-FCP-NEXT: vzeroupper
3612 ; AVX512-FCP-NEXT: retq
3614 ; AVX512DQ-LABEL: store_i32_stride3_vf64:
3615 ; AVX512DQ: # %bb.0:
3616 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm3
3617 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm2
3618 ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm1
3619 ; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm0
3620 ; AVX512DQ-NEXT: vmovdqa64 (%rsi), %zmm4
3621 ; AVX512DQ-NEXT: vmovdqa64 64(%rsi), %zmm5
3622 ; AVX512DQ-NEXT: vmovdqa64 128(%rsi), %zmm6
3623 ; AVX512DQ-NEXT: vmovdqa64 192(%rsi), %zmm7
3624 ; AVX512DQ-NEXT: vmovdqa64 (%rdx), %zmm8
3625 ; AVX512DQ-NEXT: vmovdqa64 64(%rdx), %zmm9
3626 ; AVX512DQ-NEXT: vmovdqa64 128(%rdx), %zmm10
3627 ; AVX512DQ-NEXT: vmovdqa64 192(%rdx), %zmm11
3628 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5]
3629 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm13
3630 ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm12, %zmm13
3631 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15]
3632 ; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm14, %zmm13
3633 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0]
3634 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm16
3635 ; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm15, %zmm16
3636 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm17 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31]
3637 ; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm17, %zmm16
3638 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm18 = [5,0,22,6,0,23,7,0,24,8,0,25,9,0,26,10]
3639 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm19
3640 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm18, %zmm19
3641 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15]
3642 ; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm20, %zmm19
3643 ; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm12, %zmm0
3644 ; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm14, %zmm0
3645 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm7
3646 ; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm15, %zmm7
3647 ; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm17, %zmm7
3648 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm11
3649 ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm18, %zmm11
3650 ; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm20, %zmm11
3651 ; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm12, %zmm1
3652 ; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm14, %zmm1
3653 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm6
3654 ; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm15, %zmm6
3655 ; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm17, %zmm6
3656 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm10
3657 ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm18, %zmm10
3658 ; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm20, %zmm10
3659 ; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm12, %zmm2
3660 ; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm14, %zmm2
3661 ; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm3, %zmm15
3662 ; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm17, %zmm15
3663 ; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm18, %zmm4
3664 ; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm20, %zmm4
3665 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, 64(%rcx)
3666 ; AVX512DQ-NEXT: vmovdqa64 %zmm15, 128(%rcx)
3667 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, 192(%rcx)
3668 ; AVX512DQ-NEXT: vmovdqa64 %zmm10, 256(%rcx)
3669 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, 320(%rcx)
3670 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, 384(%rcx)
3671 ; AVX512DQ-NEXT: vmovdqa64 %zmm11, 448(%rcx)
3672 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, 512(%rcx)
3673 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 576(%rcx)
3674 ; AVX512DQ-NEXT: vmovdqa64 %zmm19, 640(%rcx)
3675 ; AVX512DQ-NEXT: vmovdqa64 %zmm16, 704(%rcx)
3676 ; AVX512DQ-NEXT: vmovdqa64 %zmm13, (%rcx)
3677 ; AVX512DQ-NEXT: vzeroupper
3678 ; AVX512DQ-NEXT: retq
3680 ; AVX512DQ-FCP-LABEL: store_i32_stride3_vf64:
3681 ; AVX512DQ-FCP: # %bb.0:
3682 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm3
3683 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2
3684 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1
3685 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm0
3686 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm4
3687 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rsi), %zmm5
3688 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rsi), %zmm6
3689 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rsi), %zmm7
3690 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm8
3691 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), %zmm9
3692 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdx), %zmm10
3693 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdx), %zmm11
3694 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5]
3695 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm13
3696 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm12, %zmm13
3697 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15]
3698 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm8, %zmm14, %zmm13
3699 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0]
3700 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm16
3701 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm15, %zmm16
3702 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31]
3703 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm17, %zmm16
3704 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [5,0,22,6,0,23,7,0,24,8,0,25,9,0,26,10]
3705 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm19
3706 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm19
3707 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15]
3708 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm20, %zmm19
3709 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm12, %zmm0
3710 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm14, %zmm0
3711 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm7
3712 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm15, %zmm7
3713 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm17, %zmm7
3714 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm11
3715 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm18, %zmm11
3716 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm20, %zmm11
3717 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm12, %zmm1
3718 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm14, %zmm1
3719 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm6
3720 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm15, %zmm6
3721 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm17, %zmm6
3722 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm10
3723 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm18, %zmm10
3724 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm20, %zmm10
3725 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm12, %zmm2
3726 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm14, %zmm2
3727 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm15
3728 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm8, %zmm17, %zmm15
3729 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm18, %zmm4
3730 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm8, %zmm20, %zmm4
3731 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 64(%rcx)
3732 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, 128(%rcx)
3733 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 192(%rcx)
3734 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, 256(%rcx)
3735 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 320(%rcx)
3736 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 384(%rcx)
3737 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, 448(%rcx)
3738 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 512(%rcx)
3739 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 576(%rcx)
3740 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, 640(%rcx)
3741 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, 704(%rcx)
3742 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, (%rcx)
3743 ; AVX512DQ-FCP-NEXT: vzeroupper
3744 ; AVX512DQ-FCP-NEXT: retq
3746 ; AVX512BW-LABEL: store_i32_stride3_vf64:
3747 ; AVX512BW: # %bb.0:
3748 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm3
3749 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm2
3750 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm1
3751 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm0
3752 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm4
3753 ; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm5
3754 ; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm6
3755 ; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm7
3756 ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm8
3757 ; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm9
3758 ; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm10
3759 ; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm11
3760 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5]
3761 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm13
3762 ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm12, %zmm13
3763 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15]
3764 ; AVX512BW-NEXT: vpermt2d %zmm8, %zmm14, %zmm13
3765 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0]
3766 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16
3767 ; AVX512BW-NEXT: vpermt2d %zmm7, %zmm15, %zmm16
3768 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm17 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31]
3769 ; AVX512BW-NEXT: vpermt2d %zmm11, %zmm17, %zmm16
3770 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm18 = [5,0,22,6,0,23,7,0,24,8,0,25,9,0,26,10]
3771 ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm19
3772 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm18, %zmm19
3773 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15]
3774 ; AVX512BW-NEXT: vpermt2d %zmm11, %zmm20, %zmm19
3775 ; AVX512BW-NEXT: vpermt2d %zmm7, %zmm12, %zmm0
3776 ; AVX512BW-NEXT: vpermt2d %zmm11, %zmm14, %zmm0
3777 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7
3778 ; AVX512BW-NEXT: vpermt2d %zmm6, %zmm15, %zmm7
3779 ; AVX512BW-NEXT: vpermt2d %zmm10, %zmm17, %zmm7
3780 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11
3781 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm18, %zmm11
3782 ; AVX512BW-NEXT: vpermt2d %zmm10, %zmm20, %zmm11
3783 ; AVX512BW-NEXT: vpermt2d %zmm6, %zmm12, %zmm1
3784 ; AVX512BW-NEXT: vpermt2d %zmm10, %zmm14, %zmm1
3785 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm6
3786 ; AVX512BW-NEXT: vpermt2d %zmm5, %zmm15, %zmm6
3787 ; AVX512BW-NEXT: vpermt2d %zmm9, %zmm17, %zmm6
3788 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm10
3789 ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm18, %zmm10
3790 ; AVX512BW-NEXT: vpermt2d %zmm9, %zmm20, %zmm10
3791 ; AVX512BW-NEXT: vpermt2d %zmm5, %zmm12, %zmm2
3792 ; AVX512BW-NEXT: vpermt2d %zmm9, %zmm14, %zmm2
3793 ; AVX512BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm15
3794 ; AVX512BW-NEXT: vpermt2d %zmm8, %zmm17, %zmm15
3795 ; AVX512BW-NEXT: vpermt2d %zmm3, %zmm18, %zmm4
3796 ; AVX512BW-NEXT: vpermt2d %zmm8, %zmm20, %zmm4
3797 ; AVX512BW-NEXT: vmovdqa64 %zmm4, 64(%rcx)
3798 ; AVX512BW-NEXT: vmovdqa64 %zmm15, 128(%rcx)
3799 ; AVX512BW-NEXT: vmovdqa64 %zmm2, 192(%rcx)
3800 ; AVX512BW-NEXT: vmovdqa64 %zmm10, 256(%rcx)
3801 ; AVX512BW-NEXT: vmovdqa64 %zmm6, 320(%rcx)
3802 ; AVX512BW-NEXT: vmovdqa64 %zmm1, 384(%rcx)
3803 ; AVX512BW-NEXT: vmovdqa64 %zmm11, 448(%rcx)
3804 ; AVX512BW-NEXT: vmovdqa64 %zmm7, 512(%rcx)
3805 ; AVX512BW-NEXT: vmovdqa64 %zmm0, 576(%rcx)
3806 ; AVX512BW-NEXT: vmovdqa64 %zmm19, 640(%rcx)
3807 ; AVX512BW-NEXT: vmovdqa64 %zmm16, 704(%rcx)
3808 ; AVX512BW-NEXT: vmovdqa64 %zmm13, (%rcx)
3809 ; AVX512BW-NEXT: vzeroupper
3810 ; AVX512BW-NEXT: retq
3812 ; AVX512BW-FCP-LABEL: store_i32_stride3_vf64:
3813 ; AVX512BW-FCP: # %bb.0:
3814 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm3
3815 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2
3816 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1
3817 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm0
3818 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm4
3819 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm5
3820 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rsi), %zmm6
3821 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rsi), %zmm7
3822 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm8
3823 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm9
3824 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdx), %zmm10
3825 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdx), %zmm11
3826 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5]
3827 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm13
3828 ; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm12, %zmm13
3829 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15]
3830 ; AVX512BW-FCP-NEXT: vpermt2d %zmm8, %zmm14, %zmm13
3831 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0]
3832 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16
3833 ; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm15, %zmm16
3834 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31]
3835 ; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm17, %zmm16
3836 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [5,0,22,6,0,23,7,0,24,8,0,25,9,0,26,10]
3837 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm19
3838 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm19
3839 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15]
3840 ; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm20, %zmm19
3841 ; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm12, %zmm0
3842 ; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm14, %zmm0
3843 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm7
3844 ; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm15, %zmm7
3845 ; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm17, %zmm7
3846 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm11
3847 ; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm18, %zmm11
3848 ; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm20, %zmm11
3849 ; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm12, %zmm1
3850 ; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm14, %zmm1
3851 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm6
3852 ; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm15, %zmm6
3853 ; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm17, %zmm6
3854 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm10
3855 ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm18, %zmm10
3856 ; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm20, %zmm10
3857 ; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm12, %zmm2
3858 ; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm14, %zmm2
3859 ; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm15
3860 ; AVX512BW-FCP-NEXT: vpermt2d %zmm8, %zmm17, %zmm15
3861 ; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm18, %zmm4
3862 ; AVX512BW-FCP-NEXT: vpermt2d %zmm8, %zmm20, %zmm4
3863 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 64(%rcx)
3864 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, 128(%rcx)
3865 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 192(%rcx)
3866 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, 256(%rcx)
3867 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 320(%rcx)
3868 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 384(%rcx)
3869 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, 448(%rcx)
3870 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, 512(%rcx)
3871 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 576(%rcx)
3872 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, 640(%rcx)
3873 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, 704(%rcx)
3874 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, (%rcx)
3875 ; AVX512BW-FCP-NEXT: vzeroupper
3876 ; AVX512BW-FCP-NEXT: retq
3878 ; AVX512DQ-BW-LABEL: store_i32_stride3_vf64:
3879 ; AVX512DQ-BW: # %bb.0:
3880 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm3
3881 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm2
3882 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm1
3883 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm0
3884 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm4
3885 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm5
3886 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rsi), %zmm6
3887 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rsi), %zmm7
3888 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm8
3889 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm9
3890 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdx), %zmm10
3891 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdx), %zmm11
3892 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5]
3893 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm13
3894 ; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm12, %zmm13
3895 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15]
3896 ; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm14, %zmm13
3897 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0]
3898 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm16
3899 ; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm15, %zmm16
3900 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm17 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31]
3901 ; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm17, %zmm16
3902 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm18 = [5,0,22,6,0,23,7,0,24,8,0,25,9,0,26,10]
3903 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm19
3904 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm18, %zmm19
3905 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15]
3906 ; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm20, %zmm19
3907 ; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm12, %zmm0
3908 ; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm14, %zmm0
3909 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm7
3910 ; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm15, %zmm7
3911 ; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm17, %zmm7
3912 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm11
3913 ; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm18, %zmm11
3914 ; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm20, %zmm11
3915 ; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm12, %zmm1
3916 ; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm14, %zmm1
3917 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm6
3918 ; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm15, %zmm6
3919 ; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm17, %zmm6
3920 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm10
3921 ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm18, %zmm10
3922 ; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm20, %zmm10
3923 ; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm12, %zmm2
3924 ; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm14, %zmm2
3925 ; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm15
3926 ; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm17, %zmm15
3927 ; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm18, %zmm4
3928 ; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm20, %zmm4
3929 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 64(%rcx)
3930 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, 128(%rcx)
3931 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 192(%rcx)
3932 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, 256(%rcx)
3933 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 320(%rcx)
3934 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 384(%rcx)
3935 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, 448(%rcx)
3936 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, 512(%rcx)
3937 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 576(%rcx)
3938 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, 640(%rcx)
3939 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, 704(%rcx)
3940 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, (%rcx)
3941 ; AVX512DQ-BW-NEXT: vzeroupper
3942 ; AVX512DQ-BW-NEXT: retq
3944 ; AVX512DQ-BW-FCP-LABEL: store_i32_stride3_vf64:
3945 ; AVX512DQ-BW-FCP: # %bb.0:
3946 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm3
3947 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2
3948 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1
3949 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm0
3950 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm4
3951 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm5
3952 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rsi), %zmm6
3953 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rsi), %zmm7
3954 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm8
3955 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm9
3956 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdx), %zmm10
3957 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdx), %zmm11
3958 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5]
3959 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm13
3960 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm12, %zmm13
3961 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15]
3962 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm8, %zmm14, %zmm13
3963 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0]
3964 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16
3965 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm15, %zmm16
3966 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31]
3967 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm17, %zmm16
3968 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [5,0,22,6,0,23,7,0,24,8,0,25,9,0,26,10]
3969 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm19
3970 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm19
3971 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15]
3972 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm20, %zmm19
3973 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm12, %zmm0
3974 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm14, %zmm0
3975 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm7
3976 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm15, %zmm7
3977 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm17, %zmm7
3978 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm11
3979 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm18, %zmm11
3980 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm20, %zmm11
3981 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm12, %zmm1
3982 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm14, %zmm1
3983 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm6
3984 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm15, %zmm6
3985 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm17, %zmm6
3986 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm10
3987 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm18, %zmm10
3988 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm20, %zmm10
3989 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm12, %zmm2
3990 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm14, %zmm2
3991 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm15
3992 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm8, %zmm17, %zmm15
3993 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm18, %zmm4
3994 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm8, %zmm20, %zmm4
3995 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 64(%rcx)
3996 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, 128(%rcx)
3997 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 192(%rcx)
3998 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, 256(%rcx)
3999 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 320(%rcx)
4000 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 384(%rcx)
4001 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, 448(%rcx)
4002 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, 512(%rcx)
4003 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 576(%rcx)
4004 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, 640(%rcx)
4005 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, 704(%rcx)
4006 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, (%rcx)
4007 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
4008 ; AVX512DQ-BW-FCP-NEXT: retq
4009 %in.vec0 = load <64 x i32>, ptr %in.vecptr0, align 64
4010 %in.vec1 = load <64 x i32>, ptr %in.vecptr1, align 64
4011 %in.vec2 = load <64 x i32>, ptr %in.vecptr2, align 64
4012 %1 = shufflevector <64 x i32> %in.vec0, <64 x i32> %in.vec1, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
4013 %2 = shufflevector <64 x i32> %in.vec2, <64 x i32> poison, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
4014 %3 = shufflevector <128 x i32> %1, <128 x i32> %2, <192 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127, i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 135, i32 136, i32 137, i32 138, i32 139, i32 140, i32 141, i32 142, i32 143, i32 144, i32 145, i32 146, i32 147, i32 148, i32 149, i32 150, i32 151, i32 152, i32 153, i32 154, i32 155, i32 156, i32 157, i32 158, i32 159, i32 160, i32 161, i32 162, i32 163, i32 164, i32 165, i32 166, i32 167, i32 168, i32 169, i32 170, i32 171, i32 172, i32 173, i32 174, i32 175, i32 176, i32 177, i32 178, i32 179, i32 180, i32 181, i32 182, i32 183, i32 184, i32 185, i32 186, i32 187, i32 188, i32 189, i32 190, i32 191>
4015 %interleaved.vec = shufflevector <192 x i32> %3, <192 x i32> poison, <192 x i32> <i32 0, i32 64, i32 128, i32 1, i32 65, i32 129, i32 2, i32 66, i32 130, i32 3, i32 67, i32 131, i32 4, i32 68, i32 132, i32 5, i32 69, i32 133, i32 6, i32 70, i32 134, i32 7, i32 71, i32 135, i32 8, i32 72, i32 136, i32 9, i32 73, i32 137, i32 10, i32 74, i32 138, i32 11, i32 75, i32 139, i32 12, i32 76, i32 140, i32 13, i32 77, i32 141, i32 14, i32 78, i32 142, i32 15, i32 79, i32 143, i32 16, i32 80, i32 144, i32 17, i32 81, i32 145, i32 18, i32 82, i32 146, i32 19, i32 83, i32 147, i32 20, i32 84, i32 148, i32 21, i32 85, i32 149, i32 22, i32 86, i32 150, i32 23, i32 87, i32 151, i32 24, i32 88, i32 152, i32 25, i32 89, i32 153, i32 26, i32 90, i32 154, i32 27, i32 91, i32 155, i32 28, i32 92, i32 156, i32 29, i32 93, i32 157, i32 30, i32 94, i32 158, i32 31, i32 95, i32 159, i32 32, i32 96, i32 160, i32 33, i32 97, i32 161, i32 34, i32 98, i32 162, i32 35, i32 99, i32 163, i32 36, i32 100, i32 164, i32 37, i32 101, i32 165, i32 38, i32 102, i32 166, i32 39, i32 103, i32 167, i32 40, i32 104, i32 168, i32 41, i32 105, i32 169, i32 42, i32 106, i32 170, i32 43, i32 107, i32 171, i32 44, i32 108, i32 172, i32 45, i32 109, i32 173, i32 46, i32 110, i32 174, i32 47, i32 111, i32 175, i32 48, i32 112, i32 176, i32 49, i32 113, i32 177, i32 50, i32 114, i32 178, i32 51, i32 115, i32 179, i32 52, i32 116, i32 180, i32 53, i32 117, i32 181, i32 54, i32 118, i32 182, i32 55, i32 119, i32 183, i32 56, i32 120, i32 184, i32 57, i32 121, i32 185, i32 58, i32 122, i32 186, i32 59, i32 123, i32 187, i32 60, i32 124, i32 188, i32 61, i32 125, i32 189, i32 62, i32 126, i32 190, i32 63, i32 127, i32 191>
4016 store <192 x i32> %interleaved.vec, ptr %out.vec, align 64