1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
3 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX
4 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
5 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FP
6 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FCP
7 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512
8 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512-FCP
9 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512DQ
10 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-FCP
11 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW
12 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW-FCP
13 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512DQ-BW
14 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-BW-FCP
16 ; These patterns are produced by LoopVectorizer for interleaved stores.
; store_i64_stride4_vf2: loads one <2 x i64> vector from each of the four
; input pointers, interleaves them with stride 4 (out[4*i+j] = in_j[i]),
; and stores the resulting <8 x i64> to %out.vec.  All CHECK lines below
; are autogenerated by utils/update_llc_test_checks.py -- never hand-edit
; them; rerun the script after any codegen change.
; NOTE(review): gaps in the embedded line numbering suggest some
; autogenerated lines (entry-block headers and several trailing "retq"
; lines, plus the function's "ret void"/closing brace) were dropped in
; this copy -- regenerate with update_llc_test_checks.py to confirm.
18 define void @store_i64_stride4_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %out.vec) nounwind {
19 ; SSE-LABEL: store_i64_stride4_vf2:
21 ; SSE-NEXT: movaps (%rdi), %xmm0
22 ; SSE-NEXT: movaps (%rsi), %xmm1
23 ; SSE-NEXT: movaps (%rdx), %xmm2
24 ; SSE-NEXT: movaps (%rcx), %xmm3
25 ; SSE-NEXT: movaps %xmm0, %xmm4
26 ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm1[0]
27 ; SSE-NEXT: movaps %xmm2, %xmm5
28 ; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm3[0]
29 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1]
30 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
31 ; SSE-NEXT: movaps %xmm0, 32(%r8)
32 ; SSE-NEXT: movaps %xmm2, 48(%r8)
33 ; SSE-NEXT: movaps %xmm5, 16(%r8)
34 ; SSE-NEXT: movaps %xmm4, (%r8)
37 ; AVX-LABEL: store_i64_stride4_vf2:
39 ; AVX-NEXT: vmovaps (%rdi), %xmm0
40 ; AVX-NEXT: vmovaps (%rsi), %xmm1
41 ; AVX-NEXT: vinsertf128 $1, (%rcx), %ymm1, %ymm1
42 ; AVX-NEXT: vinsertf128 $1, (%rdx), %ymm0, %ymm0
43 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
44 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
45 ; AVX-NEXT: vmovaps %ymm0, 32(%r8)
46 ; AVX-NEXT: vmovaps %ymm2, (%r8)
47 ; AVX-NEXT: vzeroupper
50 ; AVX2-LABEL: store_i64_stride4_vf2:
52 ; AVX2-NEXT: vmovaps (%rdi), %xmm0
53 ; AVX2-NEXT: vmovaps (%rdx), %xmm1
54 ; AVX2-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0
55 ; AVX2-NEXT: vinsertf128 $1, (%rcx), %ymm1, %ymm1
56 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
57 ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
58 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
59 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
60 ; AVX2-NEXT: vmovaps %ymm0, 32(%r8)
61 ; AVX2-NEXT: vmovaps %ymm2, (%r8)
62 ; AVX2-NEXT: vzeroupper
65 ; AVX2-FP-LABEL: store_i64_stride4_vf2:
67 ; AVX2-FP-NEXT: vmovaps (%rdi), %xmm0
68 ; AVX2-FP-NEXT: vmovaps (%rdx), %xmm1
69 ; AVX2-FP-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0
70 ; AVX2-FP-NEXT: vinsertf128 $1, (%rcx), %ymm1, %ymm1
71 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
72 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
73 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
74 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
75 ; AVX2-FP-NEXT: vmovaps %ymm0, 32(%r8)
76 ; AVX2-FP-NEXT: vmovaps %ymm2, (%r8)
77 ; AVX2-FP-NEXT: vzeroupper
80 ; AVX2-FCP-LABEL: store_i64_stride4_vf2:
82 ; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm0
83 ; AVX2-FCP-NEXT: vmovaps (%rdx), %xmm1
84 ; AVX2-FCP-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0
85 ; AVX2-FCP-NEXT: vinsertf128 $1, (%rcx), %ymm1, %ymm1
86 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
87 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
88 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
89 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
90 ; AVX2-FCP-NEXT: vmovaps %ymm0, 32(%r8)
91 ; AVX2-FCP-NEXT: vmovaps %ymm2, (%r8)
92 ; AVX2-FCP-NEXT: vzeroupper
95 ; AVX512-LABEL: store_i64_stride4_vf2:
97 ; AVX512-NEXT: vmovdqa (%rdi), %xmm0
98 ; AVX512-NEXT: vmovdqa (%rdx), %xmm1
99 ; AVX512-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
100 ; AVX512-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
101 ; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,8,10,1,3,9,11]
102 ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
103 ; AVX512-NEXT: vmovdqa64 %zmm2, (%r8)
104 ; AVX512-NEXT: vzeroupper
107 ; AVX512-FCP-LABEL: store_i64_stride4_vf2:
108 ; AVX512-FCP: # %bb.0:
109 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
110 ; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm1
111 ; AVX512-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
112 ; AVX512-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
113 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,8,10,1,3,9,11]
114 ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
115 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, (%r8)
116 ; AVX512-FCP-NEXT: vzeroupper
117 ; AVX512-FCP-NEXT: retq
119 ; AVX512DQ-LABEL: store_i64_stride4_vf2:
121 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
122 ; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm1
123 ; AVX512DQ-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
124 ; AVX512DQ-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
125 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,8,10,1,3,9,11]
126 ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
127 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, (%r8)
128 ; AVX512DQ-NEXT: vzeroupper
129 ; AVX512DQ-NEXT: retq
131 ; AVX512DQ-FCP-LABEL: store_i64_stride4_vf2:
132 ; AVX512DQ-FCP: # %bb.0:
133 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
134 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm1
135 ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
136 ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
137 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,8,10,1,3,9,11]
138 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
139 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%r8)
140 ; AVX512DQ-FCP-NEXT: vzeroupper
141 ; AVX512DQ-FCP-NEXT: retq
143 ; AVX512BW-LABEL: store_i64_stride4_vf2:
145 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
146 ; AVX512BW-NEXT: vmovdqa (%rdx), %xmm1
147 ; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
148 ; AVX512BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
149 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,8,10,1,3,9,11]
150 ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
151 ; AVX512BW-NEXT: vmovdqa64 %zmm2, (%r8)
152 ; AVX512BW-NEXT: vzeroupper
153 ; AVX512BW-NEXT: retq
155 ; AVX512BW-FCP-LABEL: store_i64_stride4_vf2:
156 ; AVX512BW-FCP: # %bb.0:
157 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
158 ; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm1
159 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
160 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
161 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,8,10,1,3,9,11]
162 ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
163 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%r8)
164 ; AVX512BW-FCP-NEXT: vzeroupper
165 ; AVX512BW-FCP-NEXT: retq
167 ; AVX512DQ-BW-LABEL: store_i64_stride4_vf2:
168 ; AVX512DQ-BW: # %bb.0:
169 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0
170 ; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm1
171 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
172 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
173 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,8,10,1,3,9,11]
174 ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
175 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%r8)
176 ; AVX512DQ-BW-NEXT: vzeroupper
177 ; AVX512DQ-BW-NEXT: retq
179 ; AVX512DQ-BW-FCP-LABEL: store_i64_stride4_vf2:
180 ; AVX512DQ-BW-FCP: # %bb.0:
181 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
182 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm1
183 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
184 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
185 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,8,10,1,3,9,11]
186 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
187 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%r8)
188 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
189 ; AVX512DQ-BW-FCP-NEXT: retq
; IR body: concatenate the four 2-element inputs into one <8 x i64>
; [a0,a1,b0,b1,c0,c1,d0,d1], then the final shuffle mask <0,2,4,6,1,3,5,7>
; interleaves them to [a0,b0,c0,d0,a1,b1,c1,d1] -- the stride-4 layout.
190 %in.vec0 = load <2 x i64>, ptr %in.vecptr0, align 64
191 %in.vec1 = load <2 x i64>, ptr %in.vecptr1, align 64
192 %in.vec2 = load <2 x i64>, ptr %in.vecptr2, align 64
193 %in.vec3 = load <2 x i64>, ptr %in.vecptr3, align 64
194 %1 = shufflevector <2 x i64> %in.vec0, <2 x i64> %in.vec1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
195 %2 = shufflevector <2 x i64> %in.vec2, <2 x i64> %in.vec3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
196 %3 = shufflevector <4 x i64> %1, <4 x i64> %2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
197 %interleaved.vec = shufflevector <8 x i64> %3, <8 x i64> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
198 store <8 x i64> %interleaved.vec, ptr %out.vec, align 64
; store_i64_stride4_vf4: same interleaved-store pattern as vf2 but with
; <4 x i64> inputs -- four vectors a,b,c,d become a <16 x i64> output laid
; out as out[4*i+j] = in_j[i].  All CHECK lines are autogenerated by
; utils/update_llc_test_checks.py; never hand-edit them.
; NOTE(review): as with the previous function, the embedded line-number
; gaps indicate some autogenerated lines (entry-block headers, several
; "retq" lines, and the "ret void"/closing brace) are missing from this
; copy -- regenerate to confirm.
202 define void @store_i64_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %out.vec) nounwind {
203 ; SSE-LABEL: store_i64_stride4_vf4:
205 ; SSE-NEXT: movaps (%rdi), %xmm0
206 ; SSE-NEXT: movaps 16(%rdi), %xmm1
207 ; SSE-NEXT: movaps (%rsi), %xmm2
208 ; SSE-NEXT: movaps 16(%rsi), %xmm3
209 ; SSE-NEXT: movaps (%rdx), %xmm4
210 ; SSE-NEXT: movaps 16(%rdx), %xmm5
211 ; SSE-NEXT: movaps (%rcx), %xmm6
212 ; SSE-NEXT: movaps 16(%rcx), %xmm7
213 ; SSE-NEXT: movaps %xmm4, %xmm8
214 ; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm6[0]
215 ; SSE-NEXT: movaps %xmm0, %xmm9
216 ; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm2[0]
217 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm6[1]
218 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
219 ; SSE-NEXT: movaps %xmm5, %xmm2
220 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm7[0]
221 ; SSE-NEXT: movaps %xmm1, %xmm6
222 ; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm3[0]
223 ; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm7[1]
224 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
225 ; SSE-NEXT: movaps %xmm1, 96(%r8)
226 ; SSE-NEXT: movaps %xmm5, 112(%r8)
227 ; SSE-NEXT: movaps %xmm6, 64(%r8)
228 ; SSE-NEXT: movaps %xmm2, 80(%r8)
229 ; SSE-NEXT: movaps %xmm0, 32(%r8)
230 ; SSE-NEXT: movaps %xmm4, 48(%r8)
231 ; SSE-NEXT: movaps %xmm9, (%r8)
232 ; SSE-NEXT: movaps %xmm8, 16(%r8)
235 ; AVX-LABEL: store_i64_stride4_vf4:
237 ; AVX-NEXT: vmovaps (%rdi), %ymm0
238 ; AVX-NEXT: vmovaps (%rsi), %ymm1
239 ; AVX-NEXT: vinsertf128 $1, (%rdx), %ymm0, %ymm2
240 ; AVX-NEXT: vinsertf128 $1, (%rcx), %ymm1, %ymm3
241 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
242 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],mem[2,3]
243 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
244 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
245 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
246 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
247 ; AVX-NEXT: vmovaps %ymm5, 64(%r8)
248 ; AVX-NEXT: vmovaps %ymm0, 96(%r8)
249 ; AVX-NEXT: vmovaps %ymm2, 32(%r8)
250 ; AVX-NEXT: vmovaps %ymm4, (%r8)
251 ; AVX-NEXT: vzeroupper
254 ; AVX2-LABEL: store_i64_stride4_vf4:
256 ; AVX2-NEXT: vmovaps (%rdi), %ymm0
257 ; AVX2-NEXT: vmovaps (%rsi), %ymm1
258 ; AVX2-NEXT: vmovaps (%rdx), %ymm2
259 ; AVX2-NEXT: vmovaps (%rcx), %ymm3
260 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1]
261 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1]
262 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
263 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
264 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
265 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
266 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
267 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
268 ; AVX2-NEXT: vmovaps %ymm0, 96(%r8)
269 ; AVX2-NEXT: vmovaps %ymm3, 64(%r8)
270 ; AVX2-NEXT: vmovaps %ymm4, 32(%r8)
271 ; AVX2-NEXT: vmovaps %ymm2, (%r8)
272 ; AVX2-NEXT: vzeroupper
275 ; AVX2-FP-LABEL: store_i64_stride4_vf4:
277 ; AVX2-FP-NEXT: vmovaps (%rdi), %ymm0
278 ; AVX2-FP-NEXT: vmovaps (%rsi), %ymm1
279 ; AVX2-FP-NEXT: vmovaps (%rdx), %ymm2
280 ; AVX2-FP-NEXT: vmovaps (%rcx), %ymm3
281 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1]
282 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1]
283 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
284 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
285 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
286 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
287 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
288 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
289 ; AVX2-FP-NEXT: vmovaps %ymm0, 96(%r8)
290 ; AVX2-FP-NEXT: vmovaps %ymm3, 64(%r8)
291 ; AVX2-FP-NEXT: vmovaps %ymm4, 32(%r8)
292 ; AVX2-FP-NEXT: vmovaps %ymm2, (%r8)
293 ; AVX2-FP-NEXT: vzeroupper
296 ; AVX2-FCP-LABEL: store_i64_stride4_vf4:
298 ; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm0
299 ; AVX2-FCP-NEXT: vmovaps (%rsi), %ymm1
300 ; AVX2-FCP-NEXT: vmovaps (%rdx), %ymm2
301 ; AVX2-FCP-NEXT: vmovaps (%rcx), %ymm3
302 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1]
303 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1]
304 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
305 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
306 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
307 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
308 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
309 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
310 ; AVX2-FCP-NEXT: vmovaps %ymm0, 96(%r8)
311 ; AVX2-FCP-NEXT: vmovaps %ymm3, 64(%r8)
312 ; AVX2-FCP-NEXT: vmovaps %ymm4, 32(%r8)
313 ; AVX2-FCP-NEXT: vmovaps %ymm2, (%r8)
314 ; AVX2-FCP-NEXT: vzeroupper
315 ; AVX2-FCP-NEXT: retq
317 ; AVX512-LABEL: store_i64_stride4_vf4:
319 ; AVX512-NEXT: vmovdqa (%rdi), %ymm0
320 ; AVX512-NEXT: vmovdqa (%rsi), %ymm1
321 ; AVX512-NEXT: vmovdqa (%rdx), %ymm2
322 ; AVX512-NEXT: vmovdqa (%rcx), %ymm3
323 ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1]
324 ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1]
325 ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
326 ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
327 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
328 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
329 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
330 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
331 ; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm1
332 ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0
333 ; AVX512-NEXT: vmovdqa64 %zmm0, 64(%r8)
334 ; AVX512-NEXT: vmovdqa64 %zmm1, (%r8)
335 ; AVX512-NEXT: vzeroupper
338 ; AVX512-FCP-LABEL: store_i64_stride4_vf4:
339 ; AVX512-FCP: # %bb.0:
340 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm0
341 ; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm1
342 ; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm2
343 ; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm3
344 ; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1]
345 ; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1]
346 ; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
347 ; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
348 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,8,2,10,1,9,3,11]
349 ; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm4
350 ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
351 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 64(%r8)
352 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%r8)
353 ; AVX512-FCP-NEXT: vzeroupper
354 ; AVX512-FCP-NEXT: retq
356 ; AVX512DQ-LABEL: store_i64_stride4_vf4:
358 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
359 ; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm1
360 ; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm2
361 ; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm3
362 ; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1]
363 ; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1]
364 ; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
365 ; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
366 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
367 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
368 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
369 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
370 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm1
371 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0
372 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%r8)
373 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%r8)
374 ; AVX512DQ-NEXT: vzeroupper
375 ; AVX512DQ-NEXT: retq
377 ; AVX512DQ-FCP-LABEL: store_i64_stride4_vf4:
378 ; AVX512DQ-FCP: # %bb.0:
379 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm0
380 ; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm1
381 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm2
382 ; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm3
383 ; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1]
384 ; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1]
385 ; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
386 ; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
387 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,8,2,10,1,9,3,11]
388 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm4
389 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
390 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 64(%r8)
391 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, (%r8)
392 ; AVX512DQ-FCP-NEXT: vzeroupper
393 ; AVX512DQ-FCP-NEXT: retq
395 ; AVX512BW-LABEL: store_i64_stride4_vf4:
397 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
398 ; AVX512BW-NEXT: vmovdqa (%rsi), %ymm1
399 ; AVX512BW-NEXT: vmovdqa (%rdx), %ymm2
400 ; AVX512BW-NEXT: vmovdqa (%rcx), %ymm3
401 ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1]
402 ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1]
403 ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
404 ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
405 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
406 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
407 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
408 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
409 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm1
410 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0
411 ; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%r8)
412 ; AVX512BW-NEXT: vmovdqa64 %zmm1, (%r8)
413 ; AVX512BW-NEXT: vzeroupper
414 ; AVX512BW-NEXT: retq
416 ; AVX512BW-FCP-LABEL: store_i64_stride4_vf4:
417 ; AVX512BW-FCP: # %bb.0:
418 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm0
419 ; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %ymm1
420 ; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %ymm2
421 ; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %ymm3
422 ; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1]
423 ; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1]
424 ; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
425 ; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
426 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,8,2,10,1,9,3,11]
427 ; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm4
428 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
429 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%r8)
430 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, (%r8)
431 ; AVX512BW-FCP-NEXT: vzeroupper
432 ; AVX512BW-FCP-NEXT: retq
434 ; AVX512DQ-BW-LABEL: store_i64_stride4_vf4:
435 ; AVX512DQ-BW: # %bb.0:
436 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm0
437 ; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %ymm1
438 ; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %ymm2
439 ; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %ymm3
440 ; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1]
441 ; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1]
442 ; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
443 ; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
444 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
445 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
446 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
447 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
448 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm1
449 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0
450 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 64(%r8)
451 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, (%r8)
452 ; AVX512DQ-BW-NEXT: vzeroupper
453 ; AVX512DQ-BW-NEXT: retq
455 ; AVX512DQ-BW-FCP-LABEL: store_i64_stride4_vf4:
456 ; AVX512DQ-BW-FCP: # %bb.0:
457 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm0
458 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %ymm1
459 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %ymm2
460 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %ymm3
461 ; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1]
462 ; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1]
463 ; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
464 ; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
465 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,8,2,10,1,9,3,11]
466 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm4
467 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
468 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%r8)
469 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, (%r8)
470 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
471 ; AVX512DQ-BW-FCP-NEXT: retq
; IR body: concatenate the four 4-element inputs into one <16 x i64>
; [a0..a3,b0..b3,c0..c3,d0..d3]; the final mask <0,4,8,12, 1,5,9,13, ...>
; interleaves them to [a0,b0,c0,d0, a1,b1,c1,d1, ...] -- the stride-4 layout.
472 %in.vec0 = load <4 x i64>, ptr %in.vecptr0, align 64
473 %in.vec1 = load <4 x i64>, ptr %in.vecptr1, align 64
474 %in.vec2 = load <4 x i64>, ptr %in.vecptr2, align 64
475 %in.vec3 = load <4 x i64>, ptr %in.vecptr3, align 64
476 %1 = shufflevector <4 x i64> %in.vec0, <4 x i64> %in.vec1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
477 %2 = shufflevector <4 x i64> %in.vec2, <4 x i64> %in.vec3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
478 %3 = shufflevector <8 x i64> %1, <8 x i64> %2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
479 %interleaved.vec = shufflevector <16 x i64> %3, <16 x i64> poison, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
480 store <16 x i64> %interleaved.vec, ptr %out.vec, align 64
484 define void @store_i64_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %out.vec) nounwind {
485 ; SSE-LABEL: store_i64_stride4_vf8:
487 ; SSE-NEXT: movaps (%rdi), %xmm5
488 ; SSE-NEXT: movaps 16(%rdi), %xmm1
489 ; SSE-NEXT: movaps 32(%rdi), %xmm3
490 ; SSE-NEXT: movaps 48(%rdi), %xmm0
491 ; SSE-NEXT: movaps (%rsi), %xmm10
492 ; SSE-NEXT: movaps 16(%rsi), %xmm14
493 ; SSE-NEXT: movaps 32(%rsi), %xmm11
494 ; SSE-NEXT: movaps (%rdx), %xmm2
495 ; SSE-NEXT: movaps 16(%rdx), %xmm4
496 ; SSE-NEXT: movaps 32(%rdx), %xmm7
497 ; SSE-NEXT: movaps 48(%rdx), %xmm9
498 ; SSE-NEXT: movaps (%rcx), %xmm8
499 ; SSE-NEXT: movaps 16(%rcx), %xmm13
500 ; SSE-NEXT: movaps 32(%rcx), %xmm15
501 ; SSE-NEXT: movaps 48(%rcx), %xmm12
502 ; SSE-NEXT: movaps %xmm2, %xmm6
503 ; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm8[0]
504 ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
505 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm8[1]
506 ; SSE-NEXT: movaps %xmm5, %xmm8
507 ; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm10[0]
508 ; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm10[1]
509 ; SSE-NEXT: movaps %xmm4, %xmm10
510 ; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm13[0]
511 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm13[1]
512 ; SSE-NEXT: movaps %xmm1, %xmm13
513 ; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm14[0]
514 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm14[1]
515 ; SSE-NEXT: movaps %xmm7, %xmm14
516 ; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm15[0]
517 ; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm15[1]
518 ; SSE-NEXT: movaps %xmm3, %xmm15
519 ; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm11[0]
520 ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm11[1]
521 ; SSE-NEXT: movaps %xmm9, %xmm11
522 ; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm12[0]
523 ; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm12[1]
524 ; SSE-NEXT: movaps 48(%rsi), %xmm12
525 ; SSE-NEXT: movaps %xmm0, %xmm6
526 ; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm12[0]
527 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm12[1]
528 ; SSE-NEXT: movaps %xmm0, 224(%r8)
529 ; SSE-NEXT: movaps %xmm9, 240(%r8)
530 ; SSE-NEXT: movaps %xmm6, 192(%r8)
531 ; SSE-NEXT: movaps %xmm11, 208(%r8)
532 ; SSE-NEXT: movaps %xmm3, 160(%r8)
533 ; SSE-NEXT: movaps %xmm7, 176(%r8)
534 ; SSE-NEXT: movaps %xmm15, 128(%r8)
535 ; SSE-NEXT: movaps %xmm14, 144(%r8)
536 ; SSE-NEXT: movaps %xmm1, 96(%r8)
537 ; SSE-NEXT: movaps %xmm4, 112(%r8)
538 ; SSE-NEXT: movaps %xmm13, 64(%r8)
539 ; SSE-NEXT: movaps %xmm10, 80(%r8)
540 ; SSE-NEXT: movaps %xmm5, 32(%r8)
541 ; SSE-NEXT: movaps %xmm2, 48(%r8)
542 ; SSE-NEXT: movaps %xmm8, (%r8)
543 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
544 ; SSE-NEXT: movaps %xmm0, 16(%r8)
547 ; AVX-LABEL: store_i64_stride4_vf8:
549 ; AVX-NEXT: vmovaps (%rdx), %ymm2
550 ; AVX-NEXT: vmovaps 32(%rdx), %ymm1
551 ; AVX-NEXT: vmovaps (%rcx), %ymm3
552 ; AVX-NEXT: vmovaps 32(%rcx), %ymm4
553 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm4[0],ymm1[2],ymm4[2]
554 ; AVX-NEXT: vmovaps 48(%rsi), %xmm5
555 ; AVX-NEXT: vmovaps 48(%rdi), %xmm6
556 ; AVX-NEXT: vmovlhps {{.*#+}} xmm7 = xmm6[0],xmm5[0]
557 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
558 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm4[1],ymm1[3],ymm4[3]
559 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm6[1],xmm5[1]
560 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
561 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
562 ; AVX-NEXT: vmovaps 16(%rsi), %xmm5
563 ; AVX-NEXT: vmovaps 16(%rdi), %xmm6
564 ; AVX-NEXT: vmovlhps {{.*#+}} xmm7 = xmm6[0],xmm5[0]
565 ; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7]
566 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
567 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm6[1],xmm5[1]
568 ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
569 ; AVX-NEXT: vmovaps 32(%rcx), %xmm3
570 ; AVX-NEXT: vmovaps 32(%rdx), %xmm5
571 ; AVX-NEXT: vmovlhps {{.*#+}} xmm6 = xmm5[0],xmm3[0]
572 ; AVX-NEXT: vmovaps 32(%rsi), %xmm7
573 ; AVX-NEXT: vmovaps 32(%rdi), %xmm8
574 ; AVX-NEXT: vmovlhps {{.*#+}} xmm9 = xmm8[0],xmm7[0]
575 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm5[1],xmm3[1]
576 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm8[1],xmm7[1]
577 ; AVX-NEXT: vmovaps (%rcx), %xmm7
578 ; AVX-NEXT: vmovaps (%rdx), %xmm8
579 ; AVX-NEXT: vmovlhps {{.*#+}} xmm10 = xmm8[0],xmm7[0]
580 ; AVX-NEXT: vmovaps (%rsi), %xmm11
581 ; AVX-NEXT: vmovaps (%rdi), %xmm12
582 ; AVX-NEXT: vmovlhps {{.*#+}} xmm13 = xmm12[0],xmm11[0]
583 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm8[1],xmm7[1]
584 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm12[1],xmm11[1]
585 ; AVX-NEXT: vmovaps %xmm8, 32(%r8)
586 ; AVX-NEXT: vmovaps %xmm7, 48(%r8)
587 ; AVX-NEXT: vmovaps %xmm13, (%r8)
588 ; AVX-NEXT: vmovaps %xmm10, 16(%r8)
589 ; AVX-NEXT: vmovaps %xmm5, 160(%r8)
590 ; AVX-NEXT: vmovaps %xmm3, 176(%r8)
591 ; AVX-NEXT: vmovaps %xmm9, 128(%r8)
592 ; AVX-NEXT: vmovaps %xmm6, 144(%r8)
593 ; AVX-NEXT: vmovaps %ymm2, 96(%r8)
594 ; AVX-NEXT: vmovaps %ymm4, 64(%r8)
595 ; AVX-NEXT: vmovaps %ymm1, 224(%r8)
596 ; AVX-NEXT: vmovaps %ymm0, 192(%r8)
597 ; AVX-NEXT: vzeroupper
600 ; AVX2-LABEL: store_i64_stride4_vf8:
602 ; AVX2-NEXT: vmovaps (%rdi), %ymm0
603 ; AVX2-NEXT: vmovaps 32(%rdi), %ymm1
604 ; AVX2-NEXT: vmovaps (%rsi), %ymm2
605 ; AVX2-NEXT: vmovaps 32(%rsi), %ymm3
606 ; AVX2-NEXT: vmovaps (%rdx), %ymm4
607 ; AVX2-NEXT: vmovaps 32(%rdx), %ymm5
608 ; AVX2-NEXT: vmovaps (%rcx), %ymm6
609 ; AVX2-NEXT: vmovaps 32(%rcx), %ymm7
610 ; AVX2-NEXT: vmovaps (%rsi), %xmm8
611 ; AVX2-NEXT: vmovaps 32(%rsi), %xmm9
612 ; AVX2-NEXT: vinsertf128 $1, (%rcx), %ymm8, %ymm8
613 ; AVX2-NEXT: vmovaps (%rdi), %xmm10
614 ; AVX2-NEXT: vmovaps 32(%rdi), %xmm11
615 ; AVX2-NEXT: vinsertf128 $1, (%rdx), %ymm10, %ymm10
616 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm10[1],ymm8[1],ymm10[3],ymm8[3]
617 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm10[0],ymm8[0],ymm10[2],ymm8[2]
618 ; AVX2-NEXT: vinsertf128 $1, 32(%rcx), %ymm9, %ymm9
619 ; AVX2-NEXT: vinsertf128 $1, 32(%rdx), %ymm11, %ymm10
620 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm10[0],ymm9[0],ymm10[2],ymm9[2]
621 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm10[1],ymm9[1],ymm10[3],ymm9[3]
622 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm5[0],ymm7[0],ymm5[2],ymm7[2]
623 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm1[0],ymm3[0],ymm1[2],ymm3[2]
624 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm13[2,3],ymm10[2,3]
625 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm5[1],ymm7[1],ymm5[3],ymm7[3]
626 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3]
627 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm5[2,3]
628 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm4[0],ymm6[0],ymm4[2],ymm6[2]
629 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
630 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm5[2,3],ymm3[2,3]
631 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm6[1],ymm4[3],ymm6[3]
632 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
633 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm4[2,3]
634 ; AVX2-NEXT: vmovaps %ymm0, 96(%r8)
635 ; AVX2-NEXT: vmovaps %ymm3, 64(%r8)
636 ; AVX2-NEXT: vmovaps %ymm1, 224(%r8)
637 ; AVX2-NEXT: vmovaps %ymm10, 192(%r8)
638 ; AVX2-NEXT: vmovaps %ymm9, 160(%r8)
639 ; AVX2-NEXT: vmovaps %ymm11, 128(%r8)
640 ; AVX2-NEXT: vmovaps %ymm8, (%r8)
641 ; AVX2-NEXT: vmovaps %ymm12, 32(%r8)
642 ; AVX2-NEXT: vzeroupper
645 ; AVX2-FP-LABEL: store_i64_stride4_vf8:
647 ; AVX2-FP-NEXT: vmovaps (%rdi), %ymm0
648 ; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm1
649 ; AVX2-FP-NEXT: vmovaps (%rsi), %ymm2
650 ; AVX2-FP-NEXT: vmovaps 32(%rsi), %ymm3
651 ; AVX2-FP-NEXT: vmovaps (%rdx), %ymm4
652 ; AVX2-FP-NEXT: vmovaps 32(%rdx), %ymm5
653 ; AVX2-FP-NEXT: vmovaps (%rcx), %ymm6
654 ; AVX2-FP-NEXT: vmovaps 32(%rcx), %ymm7
655 ; AVX2-FP-NEXT: vmovaps (%rsi), %xmm8
656 ; AVX2-FP-NEXT: vmovaps 32(%rsi), %xmm9
657 ; AVX2-FP-NEXT: vinsertf128 $1, (%rcx), %ymm8, %ymm8
658 ; AVX2-FP-NEXT: vmovaps (%rdi), %xmm10
659 ; AVX2-FP-NEXT: vmovaps 32(%rdi), %xmm11
660 ; AVX2-FP-NEXT: vinsertf128 $1, (%rdx), %ymm10, %ymm10
661 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm10[1],ymm8[1],ymm10[3],ymm8[3]
662 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm10[0],ymm8[0],ymm10[2],ymm8[2]
663 ; AVX2-FP-NEXT: vinsertf128 $1, 32(%rcx), %ymm9, %ymm9
664 ; AVX2-FP-NEXT: vinsertf128 $1, 32(%rdx), %ymm11, %ymm10
665 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm10[0],ymm9[0],ymm10[2],ymm9[2]
666 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm10[1],ymm9[1],ymm10[3],ymm9[3]
667 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm5[0],ymm7[0],ymm5[2],ymm7[2]
668 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm1[0],ymm3[0],ymm1[2],ymm3[2]
669 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm13[2,3],ymm10[2,3]
670 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm5[1],ymm7[1],ymm5[3],ymm7[3]
671 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3]
672 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm5[2,3]
673 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm4[0],ymm6[0],ymm4[2],ymm6[2]
674 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
675 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm5[2,3],ymm3[2,3]
676 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm6[1],ymm4[3],ymm6[3]
677 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
678 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm4[2,3]
679 ; AVX2-FP-NEXT: vmovaps %ymm0, 96(%r8)
680 ; AVX2-FP-NEXT: vmovaps %ymm3, 64(%r8)
681 ; AVX2-FP-NEXT: vmovaps %ymm1, 224(%r8)
682 ; AVX2-FP-NEXT: vmovaps %ymm10, 192(%r8)
683 ; AVX2-FP-NEXT: vmovaps %ymm9, 160(%r8)
684 ; AVX2-FP-NEXT: vmovaps %ymm11, 128(%r8)
685 ; AVX2-FP-NEXT: vmovaps %ymm8, (%r8)
686 ; AVX2-FP-NEXT: vmovaps %ymm12, 32(%r8)
687 ; AVX2-FP-NEXT: vzeroupper
690 ; AVX2-FCP-LABEL: store_i64_stride4_vf8:
692 ; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm0
693 ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm1
694 ; AVX2-FCP-NEXT: vmovaps (%rsi), %ymm2
695 ; AVX2-FCP-NEXT: vmovaps 32(%rsi), %ymm3
696 ; AVX2-FCP-NEXT: vmovaps (%rdx), %ymm4
697 ; AVX2-FCP-NEXT: vmovaps 32(%rdx), %ymm5
698 ; AVX2-FCP-NEXT: vmovaps (%rcx), %ymm6
699 ; AVX2-FCP-NEXT: vmovaps 32(%rcx), %ymm7
700 ; AVX2-FCP-NEXT: vmovaps (%rsi), %xmm8
701 ; AVX2-FCP-NEXT: vmovaps 32(%rsi), %xmm9
702 ; AVX2-FCP-NEXT: vinsertf128 $1, (%rcx), %ymm8, %ymm8
703 ; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm10
704 ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %xmm11
705 ; AVX2-FCP-NEXT: vinsertf128 $1, (%rdx), %ymm10, %ymm10
706 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm10[1],ymm8[1],ymm10[3],ymm8[3]
707 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm10[0],ymm8[0],ymm10[2],ymm8[2]
708 ; AVX2-FCP-NEXT: vinsertf128 $1, 32(%rcx), %ymm9, %ymm9
709 ; AVX2-FCP-NEXT: vinsertf128 $1, 32(%rdx), %ymm11, %ymm10
710 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm10[0],ymm9[0],ymm10[2],ymm9[2]
711 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm10[1],ymm9[1],ymm10[3],ymm9[3]
712 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm5[0],ymm7[0],ymm5[2],ymm7[2]
713 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm1[0],ymm3[0],ymm1[2],ymm3[2]
714 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm13[2,3],ymm10[2,3]
715 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm5[1],ymm7[1],ymm5[3],ymm7[3]
716 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3]
717 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm5[2,3]
718 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm4[0],ymm6[0],ymm4[2],ymm6[2]
719 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
720 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm5[2,3],ymm3[2,3]
721 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm6[1],ymm4[3],ymm6[3]
722 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
723 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm4[2,3]
724 ; AVX2-FCP-NEXT: vmovaps %ymm0, 96(%r8)
725 ; AVX2-FCP-NEXT: vmovaps %ymm3, 64(%r8)
726 ; AVX2-FCP-NEXT: vmovaps %ymm1, 224(%r8)
727 ; AVX2-FCP-NEXT: vmovaps %ymm10, 192(%r8)
728 ; AVX2-FCP-NEXT: vmovaps %ymm9, 160(%r8)
729 ; AVX2-FCP-NEXT: vmovaps %ymm11, 128(%r8)
730 ; AVX2-FCP-NEXT: vmovaps %ymm8, (%r8)
731 ; AVX2-FCP-NEXT: vmovaps %ymm12, 32(%r8)
732 ; AVX2-FCP-NEXT: vzeroupper
733 ; AVX2-FCP-NEXT: retq
735 ; AVX512-LABEL: store_i64_stride4_vf8:
737 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
738 ; AVX512-NEXT: vmovdqa64 (%rsi), %zmm1
739 ; AVX512-NEXT: vmovdqa64 (%rdx), %zmm2
740 ; AVX512-NEXT: vmovdqa64 (%rcx), %zmm3
741 ; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,8,0,0,1,9]
742 ; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm4
743 ; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,8,0,0,1,9,0,0]
744 ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm5
745 ; AVX512-NEXT: movb $-52, %al
746 ; AVX512-NEXT: kmovw %eax, %k1
747 ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1}
748 ; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,2,10,0,0,3,11]
749 ; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm4
750 ; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm6 = [2,10,0,0,3,11,0,0]
751 ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm6
752 ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1}
753 ; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,4,12,0,0,5,13]
754 ; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm4
755 ; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm7 = [4,12,0,0,5,13,0,0]
756 ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm7
757 ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1}
758 ; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,6,14,0,0,7,15]
759 ; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm4
760 ; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm2 = [6,14,0,0,7,15,0,0]
761 ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
762 ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1}
763 ; AVX512-NEXT: vmovdqa64 %zmm2, 192(%r8)
764 ; AVX512-NEXT: vmovdqa64 %zmm7, 128(%r8)
765 ; AVX512-NEXT: vmovdqa64 %zmm6, 64(%r8)
766 ; AVX512-NEXT: vmovdqa64 %zmm5, (%r8)
767 ; AVX512-NEXT: vzeroupper
770 ; AVX512-FCP-LABEL: store_i64_stride4_vf8:
771 ; AVX512-FCP: # %bb.0:
772 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
773 ; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm1
774 ; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm2
775 ; AVX512-FCP-NEXT: vmovdqa64 (%rcx), %zmm3
776 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,8,0,0,1,9]
777 ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4
778 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,8,0,0,1,9,0,0]
779 ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm5
780 ; AVX512-FCP-NEXT: movb $-52, %al
781 ; AVX512-FCP-NEXT: kmovw %eax, %k1
782 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1}
783 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,2,10,0,0,3,11]
784 ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4
785 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [2,10,0,0,3,11,0,0]
786 ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm6
787 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1}
788 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,4,12,0,0,5,13]
789 ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4
790 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [4,12,0,0,5,13,0,0]
791 ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm7
792 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1}
793 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,6,14,0,0,7,15]
794 ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4
795 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [6,14,0,0,7,15,0,0]
796 ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
797 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1}
798 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 192(%r8)
799 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 128(%r8)
800 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 64(%r8)
801 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%r8)
802 ; AVX512-FCP-NEXT: vzeroupper
803 ; AVX512-FCP-NEXT: retq
805 ; AVX512DQ-LABEL: store_i64_stride4_vf8:
807 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0
808 ; AVX512DQ-NEXT: vmovdqa64 (%rsi), %zmm1
809 ; AVX512DQ-NEXT: vmovdqa64 (%rdx), %zmm2
810 ; AVX512DQ-NEXT: vmovdqa64 (%rcx), %zmm3
811 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,8,0,0,1,9]
812 ; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm2, %zmm4
813 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,8,0,0,1,9,0,0]
814 ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm5
815 ; AVX512DQ-NEXT: movb $-52, %al
816 ; AVX512DQ-NEXT: kmovw %eax, %k1
817 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1}
818 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,2,10,0,0,3,11]
819 ; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm2, %zmm4
820 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm6 = [2,10,0,0,3,11,0,0]
821 ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm6
822 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1}
823 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,4,12,0,0,5,13]
824 ; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm2, %zmm4
825 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm7 = [4,12,0,0,5,13,0,0]
826 ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm7
827 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1}
828 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,6,14,0,0,7,15]
829 ; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm2, %zmm4
830 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [6,14,0,0,7,15,0,0]
831 ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
832 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1}
833 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, 192(%r8)
834 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, 128(%r8)
835 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, 64(%r8)
836 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, (%r8)
837 ; AVX512DQ-NEXT: vzeroupper
838 ; AVX512DQ-NEXT: retq
840 ; AVX512DQ-FCP-LABEL: store_i64_stride4_vf8:
841 ; AVX512DQ-FCP: # %bb.0:
842 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
843 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm1
844 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm2
845 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rcx), %zmm3
846 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,8,0,0,1,9]
847 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4
848 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,8,0,0,1,9,0,0]
849 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm5
850 ; AVX512DQ-FCP-NEXT: movb $-52, %al
851 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1
852 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1}
853 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,2,10,0,0,3,11]
854 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4
855 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [2,10,0,0,3,11,0,0]
856 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm6
857 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1}
858 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,4,12,0,0,5,13]
859 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4
860 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [4,12,0,0,5,13,0,0]
861 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm7
862 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1}
863 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,6,14,0,0,7,15]
864 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4
865 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [6,14,0,0,7,15,0,0]
866 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
867 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1}
868 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 192(%r8)
869 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 128(%r8)
870 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 64(%r8)
871 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, (%r8)
872 ; AVX512DQ-FCP-NEXT: vzeroupper
873 ; AVX512DQ-FCP-NEXT: retq
875 ; AVX512BW-LABEL: store_i64_stride4_vf8:
877 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
878 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm1
879 ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm2
880 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm3
881 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,8,0,0,1,9]
882 ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm4
883 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,8,0,0,1,9,0,0]
884 ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm5
885 ; AVX512BW-NEXT: movb $-52, %al
886 ; AVX512BW-NEXT: kmovd %eax, %k1
887 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1}
888 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,2,10,0,0,3,11]
889 ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm4
890 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [2,10,0,0,3,11,0,0]
891 ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm6
892 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1}
893 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,4,12,0,0,5,13]
894 ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm4
895 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [4,12,0,0,5,13,0,0]
896 ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm7
897 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1}
898 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,6,14,0,0,7,15]
899 ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm4
900 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [6,14,0,0,7,15,0,0]
901 ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
902 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1}
903 ; AVX512BW-NEXT: vmovdqa64 %zmm2, 192(%r8)
904 ; AVX512BW-NEXT: vmovdqa64 %zmm7, 128(%r8)
905 ; AVX512BW-NEXT: vmovdqa64 %zmm6, 64(%r8)
906 ; AVX512BW-NEXT: vmovdqa64 %zmm5, (%r8)
907 ; AVX512BW-NEXT: vzeroupper
908 ; AVX512BW-NEXT: retq
910 ; AVX512BW-FCP-LABEL: store_i64_stride4_vf8:
911 ; AVX512BW-FCP: # %bb.0:
912 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
913 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm1
914 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm2
915 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm3
916 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,8,0,0,1,9]
917 ; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4
918 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,8,0,0,1,9,0,0]
919 ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm5
920 ; AVX512BW-FCP-NEXT: movb $-52, %al
921 ; AVX512BW-FCP-NEXT: kmovd %eax, %k1
922 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1}
923 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,2,10,0,0,3,11]
924 ; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4
925 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [2,10,0,0,3,11,0,0]
926 ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm6
927 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1}
928 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,4,12,0,0,5,13]
929 ; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4
930 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [4,12,0,0,5,13,0,0]
931 ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm7
932 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1}
933 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,6,14,0,0,7,15]
934 ; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4
935 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [6,14,0,0,7,15,0,0]
936 ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
937 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1}
938 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 192(%r8)
939 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, 128(%r8)
940 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 64(%r8)
941 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, (%r8)
942 ; AVX512BW-FCP-NEXT: vzeroupper
943 ; AVX512BW-FCP-NEXT: retq
945 ; AVX512DQ-BW-LABEL: store_i64_stride4_vf8:
946 ; AVX512DQ-BW: # %bb.0:
947 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0
948 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm1
949 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm2
950 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm3
951 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,8,0,0,1,9]
952 ; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm4
953 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,8,0,0,1,9,0,0]
954 ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm5
955 ; AVX512DQ-BW-NEXT: movb $-52, %al
956 ; AVX512DQ-BW-NEXT: kmovd %eax, %k1
957 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1}
958 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,2,10,0,0,3,11]
959 ; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm4
960 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [2,10,0,0,3,11,0,0]
961 ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm6
962 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1}
963 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,4,12,0,0,5,13]
964 ; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm4
965 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [4,12,0,0,5,13,0,0]
966 ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm7
967 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1}
968 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,6,14,0,0,7,15]
969 ; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm4
970 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [6,14,0,0,7,15,0,0]
971 ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
972 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1}
973 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 192(%r8)
974 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, 128(%r8)
975 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 64(%r8)
976 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, (%r8)
977 ; AVX512DQ-BW-NEXT: vzeroupper
978 ; AVX512DQ-BW-NEXT: retq
980 ; AVX512DQ-BW-FCP-LABEL: store_i64_stride4_vf8:
981 ; AVX512DQ-BW-FCP: # %bb.0:
982 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
983 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm1
984 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm2
985 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm3
986 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,8,0,0,1,9]
987 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4
988 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,8,0,0,1,9,0,0]
989 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm5
990 ; AVX512DQ-BW-FCP-NEXT: movb $-52, %al
991 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1
992 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1}
993 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,2,10,0,0,3,11]
994 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4
995 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [2,10,0,0,3,11,0,0]
996 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm6
997 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1}
998 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,4,12,0,0,5,13]
999 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4
1000 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [4,12,0,0,5,13,0,0]
1001 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm7
1002 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1}
1003 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,6,14,0,0,7,15]
1004 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4
1005 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [6,14,0,0,7,15,0,0]
1006 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
1007 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1}
1008 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 192(%r8)
1009 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, 128(%r8)
1010 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 64(%r8)
1011 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, (%r8)
1012 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
1013 ; AVX512DQ-BW-FCP-NEXT: retq
1014 %in.vec0 = load <8 x i64>, ptr %in.vecptr0, align 64
1015 %in.vec1 = load <8 x i64>, ptr %in.vecptr1, align 64
1016 %in.vec2 = load <8 x i64>, ptr %in.vecptr2, align 64
1017 %in.vec3 = load <8 x i64>, ptr %in.vecptr3, align 64
1018 %1 = shufflevector <8 x i64> %in.vec0, <8 x i64> %in.vec1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1019 %2 = shufflevector <8 x i64> %in.vec2, <8 x i64> %in.vec3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1020 %3 = shufflevector <16 x i64> %1, <16 x i64> %2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
1021 %interleaved.vec = shufflevector <32 x i64> %3, <32 x i64> poison, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
1022 store <32 x i64> %interleaved.vec, ptr %out.vec, align 64
1026 define void @store_i64_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %out.vec) nounwind {
1027 ; SSE-LABEL: store_i64_stride4_vf16:
1029 ; SSE-NEXT: subq $152, %rsp
1030 ; SSE-NEXT: movaps (%rdi), %xmm7
1031 ; SSE-NEXT: movaps 16(%rdi), %xmm8
1032 ; SSE-NEXT: movaps 32(%rdi), %xmm9
1033 ; SSE-NEXT: movaps 48(%rdi), %xmm10
1034 ; SSE-NEXT: movaps (%rsi), %xmm3
1035 ; SSE-NEXT: movaps 16(%rsi), %xmm2
1036 ; SSE-NEXT: movaps 32(%rsi), %xmm1
1037 ; SSE-NEXT: movaps 48(%rsi), %xmm0
1038 ; SSE-NEXT: movaps (%rdx), %xmm11
1039 ; SSE-NEXT: movaps 16(%rdx), %xmm12
1040 ; SSE-NEXT: movaps 32(%rdx), %xmm14
1041 ; SSE-NEXT: movaps 48(%rdx), %xmm15
1042 ; SSE-NEXT: movaps (%rcx), %xmm4
1043 ; SSE-NEXT: movaps 16(%rcx), %xmm5
1044 ; SSE-NEXT: movaps 32(%rcx), %xmm6
1045 ; SSE-NEXT: movaps %xmm7, %xmm13
1046 ; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm3[0]
1047 ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1048 ; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm3[1]
1049 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1050 ; SSE-NEXT: movaps %xmm11, %xmm7
1051 ; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm4[0]
1052 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1053 ; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm4[1]
1054 ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1055 ; SSE-NEXT: movaps %xmm8, %xmm3
1056 ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0]
1057 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1058 ; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm2[1]
1059 ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1060 ; SSE-NEXT: movaps %xmm12, %xmm3
1061 ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm5[0]
1062 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1063 ; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm5[1]
1064 ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1065 ; SSE-NEXT: movaps %xmm9, %xmm2
1066 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
1067 ; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill
1068 ; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm1[1]
1069 ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1070 ; SSE-NEXT: movaps %xmm14, %xmm1
1071 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm6[0]
1072 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1073 ; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm6[1]
1074 ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1075 ; SSE-NEXT: movaps %xmm10, %xmm1
1076 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
1077 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1078 ; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1]
1079 ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1080 ; SSE-NEXT: movaps 48(%rcx), %xmm0
1081 ; SSE-NEXT: movaps %xmm15, %xmm1
1082 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
1083 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1084 ; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1]
1085 ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1086 ; SSE-NEXT: movaps 64(%rdi), %xmm14
1087 ; SSE-NEXT: movaps 64(%rsi), %xmm0
1088 ; SSE-NEXT: movaps %xmm14, %xmm1
1089 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
1090 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1091 ; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1]
1092 ; SSE-NEXT: movaps 64(%rdx), %xmm12
1093 ; SSE-NEXT: movaps 64(%rcx), %xmm0
1094 ; SSE-NEXT: movaps %xmm12, %xmm15
1095 ; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm0[0]
1096 ; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1]
1097 ; SSE-NEXT: movaps 80(%rdi), %xmm11
1098 ; SSE-NEXT: movaps 80(%rsi), %xmm0
1099 ; SSE-NEXT: movaps %xmm11, %xmm13
1100 ; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm0[0]
1101 ; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1]
1102 ; SSE-NEXT: movaps 80(%rdx), %xmm7
1103 ; SSE-NEXT: movaps 80(%rcx), %xmm1
1104 ; SSE-NEXT: movaps %xmm7, %xmm10
1105 ; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm1[0]
1106 ; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm1[1]
1107 ; SSE-NEXT: movaps 96(%rdi), %xmm8
1108 ; SSE-NEXT: movaps 96(%rsi), %xmm0
1109 ; SSE-NEXT: movaps %xmm8, %xmm9
1110 ; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm0[0]
1111 ; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1]
1112 ; SSE-NEXT: movaps 96(%rdx), %xmm5
1113 ; SSE-NEXT: movaps 96(%rcx), %xmm0
1114 ; SSE-NEXT: movaps %xmm5, %xmm6
1115 ; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0]
1116 ; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1]
1117 ; SSE-NEXT: movaps 112(%rdi), %xmm2
1118 ; SSE-NEXT: movaps 112(%rsi), %xmm3
1119 ; SSE-NEXT: movaps %xmm2, %xmm4
1120 ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm3[0]
1121 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1]
1122 ; SSE-NEXT: movaps 112(%rdx), %xmm3
1123 ; SSE-NEXT: movaps 112(%rcx), %xmm1
1124 ; SSE-NEXT: movaps %xmm3, %xmm0
1125 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1126 ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1]
1127 ; SSE-NEXT: movaps %xmm3, 496(%r8)
1128 ; SSE-NEXT: movaps %xmm2, 480(%r8)
1129 ; SSE-NEXT: movaps %xmm0, 464(%r8)
1130 ; SSE-NEXT: movaps %xmm4, 448(%r8)
1131 ; SSE-NEXT: movaps %xmm5, 432(%r8)
1132 ; SSE-NEXT: movaps %xmm8, 416(%r8)
1133 ; SSE-NEXT: movaps %xmm6, 400(%r8)
1134 ; SSE-NEXT: movaps %xmm9, 384(%r8)
1135 ; SSE-NEXT: movaps %xmm7, 368(%r8)
1136 ; SSE-NEXT: movaps %xmm11, 352(%r8)
1137 ; SSE-NEXT: movaps %xmm10, 336(%r8)
1138 ; SSE-NEXT: movaps %xmm13, 320(%r8)
1139 ; SSE-NEXT: movaps %xmm12, 304(%r8)
1140 ; SSE-NEXT: movaps %xmm14, 288(%r8)
1141 ; SSE-NEXT: movaps %xmm15, 272(%r8)
1142 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1143 ; SSE-NEXT: movaps %xmm0, 256(%r8)
1144 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1145 ; SSE-NEXT: movaps %xmm0, 240(%r8)
1146 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1147 ; SSE-NEXT: movaps %xmm0, 224(%r8)
1148 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1149 ; SSE-NEXT: movaps %xmm0, 208(%r8)
1150 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1151 ; SSE-NEXT: movaps %xmm0, 192(%r8)
1152 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1153 ; SSE-NEXT: movaps %xmm0, 176(%r8)
1154 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1155 ; SSE-NEXT: movaps %xmm0, 160(%r8)
1156 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1157 ; SSE-NEXT: movaps %xmm0, 144(%r8)
1158 ; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
1159 ; SSE-NEXT: movaps %xmm0, 128(%r8)
1160 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1161 ; SSE-NEXT: movaps %xmm0, 112(%r8)
1162 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1163 ; SSE-NEXT: movaps %xmm0, 96(%r8)
1164 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1165 ; SSE-NEXT: movaps %xmm0, 80(%r8)
1166 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1167 ; SSE-NEXT: movaps %xmm0, 64(%r8)
1168 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1169 ; SSE-NEXT: movaps %xmm0, 48(%r8)
1170 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1171 ; SSE-NEXT: movaps %xmm0, 32(%r8)
1172 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1173 ; SSE-NEXT: movaps %xmm0, 16(%r8)
1174 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1175 ; SSE-NEXT: movaps %xmm0, (%r8)
1176 ; SSE-NEXT: addq $152, %rsp
1179 ; AVX-LABEL: store_i64_stride4_vf16:
1181 ; AVX-NEXT: subq $152, %rsp
1182 ; AVX-NEXT: vmovaps 96(%rdx), %ymm7
1183 ; AVX-NEXT: vmovaps 64(%rdx), %ymm5
1184 ; AVX-NEXT: vmovaps 32(%rdx), %ymm3
1185 ; AVX-NEXT: vmovaps (%rdx), %ymm1
1186 ; AVX-NEXT: vmovaps 96(%rcx), %ymm8
1187 ; AVX-NEXT: vmovaps 64(%rcx), %ymm6
1188 ; AVX-NEXT: vmovaps 32(%rcx), %ymm4
1189 ; AVX-NEXT: vmovaps (%rcx), %ymm2
1190 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
1191 ; AVX-NEXT: vmovaps 16(%rsi), %xmm9
1192 ; AVX-NEXT: vmovaps 16(%rdi), %xmm10
1193 ; AVX-NEXT: vmovlhps {{.*#+}} xmm11 = xmm10[0],xmm9[0]
1194 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7]
1195 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1196 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3]
1197 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm10[1],xmm9[1]
1198 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm1[4,5,6,7]
1199 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1200 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm3[0],ymm4[0],ymm3[2],ymm4[2]
1201 ; AVX-NEXT: vmovaps 48(%rsi), %xmm9
1202 ; AVX-NEXT: vmovaps 48(%rdi), %xmm10
1203 ; AVX-NEXT: vmovlhps {{.*#+}} xmm11 = xmm10[0],xmm9[0]
1204 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm2[4,5,6,7]
1205 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1206 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm4[1],ymm3[3],ymm4[3]
1207 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm10[1],xmm9[1]
1208 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm3[4,5,6,7]
1209 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1210 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm5[0],ymm6[0],ymm5[2],ymm6[2]
1211 ; AVX-NEXT: vmovaps 80(%rsi), %xmm9
1212 ; AVX-NEXT: vmovaps 80(%rdi), %xmm10
1213 ; AVX-NEXT: vmovlhps {{.*#+}} xmm11 = xmm10[0],xmm9[0]
1214 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm4[4,5,6,7]
1215 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1216 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm5[1],ymm6[1],ymm5[3],ymm6[3]
1217 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm10[1],xmm9[1]
1218 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm5[4,5,6,7]
1219 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1220 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm7[0],ymm8[0],ymm7[2],ymm8[2]
1221 ; AVX-NEXT: vmovaps 112(%rsi), %xmm9
1222 ; AVX-NEXT: vmovaps 112(%rdi), %xmm10
1223 ; AVX-NEXT: vmovlhps {{.*#+}} xmm11 = xmm10[0],xmm9[0]
1224 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm6[4,5,6,7]
1225 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1226 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm8[1],ymm7[3],ymm8[3]
1227 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm10[1],xmm9[1]
1228 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm7[4,5,6,7]
1229 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1230 ; AVX-NEXT: vmovaps 64(%rsi), %xmm10
1231 ; AVX-NEXT: vmovaps 64(%rdi), %xmm11
1232 ; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm11[0],xmm10[0]
1233 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1234 ; AVX-NEXT: vmovaps 32(%rcx), %xmm14
1235 ; AVX-NEXT: vmovaps 64(%rcx), %xmm12
1236 ; AVX-NEXT: vmovaps 64(%rdx), %xmm13
1237 ; AVX-NEXT: vmovlhps {{.*#+}} xmm9 = xmm13[0],xmm12[0]
1238 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm11[1],xmm10[1]
1239 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm13[1],xmm12[1]
1240 ; AVX-NEXT: vmovaps 32(%rsi), %xmm13
1241 ; AVX-NEXT: vmovaps 32(%rdi), %xmm15
1242 ; AVX-NEXT: vmovlhps {{.*#+}} xmm12 = xmm15[0],xmm13[0]
1243 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm15[1],xmm13[1]
1244 ; AVX-NEXT: vmovaps 32(%rdx), %xmm15
1245 ; AVX-NEXT: vmovlhps {{.*#+}} xmm8 = xmm15[0],xmm14[0]
1246 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm14 = xmm15[1],xmm14[1]
1247 ; AVX-NEXT: vmovaps 96(%rsi), %xmm15
1248 ; AVX-NEXT: vmovaps 96(%rdi), %xmm0
1249 ; AVX-NEXT: vmovlhps {{.*#+}} xmm7 = xmm0[0],xmm15[0]
1250 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm0[1],xmm15[1]
1251 ; AVX-NEXT: vmovaps 96(%rcx), %xmm15
1252 ; AVX-NEXT: vmovaps 96(%rdx), %xmm0
1253 ; AVX-NEXT: vmovlhps {{.*#+}} xmm5 = xmm0[0],xmm15[0]
1254 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm0[1],xmm15[1]
1255 ; AVX-NEXT: vmovaps (%rsi), %xmm15
1256 ; AVX-NEXT: vmovaps (%rdi), %xmm0
1257 ; AVX-NEXT: vmovlhps {{.*#+}} xmm3 = xmm0[0],xmm15[0]
1258 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm0[1],xmm15[1]
1259 ; AVX-NEXT: vmovaps (%rcx), %xmm15
1260 ; AVX-NEXT: vmovaps (%rdx), %xmm0
1261 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm15[0]
1262 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm15[1]
1263 ; AVX-NEXT: vmovaps %xmm0, 48(%r8)
1264 ; AVX-NEXT: vmovaps %xmm2, 32(%r8)
1265 ; AVX-NEXT: vmovaps %xmm1, 16(%r8)
1266 ; AVX-NEXT: vmovaps %xmm3, (%r8)
1267 ; AVX-NEXT: vmovaps %xmm4, 432(%r8)
1268 ; AVX-NEXT: vmovaps %xmm6, 416(%r8)
1269 ; AVX-NEXT: vmovaps %xmm5, 400(%r8)
1270 ; AVX-NEXT: vmovaps %xmm7, 384(%r8)
1271 ; AVX-NEXT: vmovaps %xmm14, 176(%r8)
1272 ; AVX-NEXT: vmovaps %xmm13, 160(%r8)
1273 ; AVX-NEXT: vmovaps %xmm8, 144(%r8)
1274 ; AVX-NEXT: vmovaps %xmm12, 128(%r8)
1275 ; AVX-NEXT: vmovaps %xmm11, 304(%r8)
1276 ; AVX-NEXT: vmovaps %xmm10, 288(%r8)
1277 ; AVX-NEXT: vmovaps %xmm9, 272(%r8)
1278 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1279 ; AVX-NEXT: vmovaps %xmm0, 256(%r8)
1280 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1281 ; AVX-NEXT: vmovaps %ymm0, 480(%r8)
1282 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1283 ; AVX-NEXT: vmovaps %ymm0, 448(%r8)
1284 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1285 ; AVX-NEXT: vmovaps %ymm0, 352(%r8)
1286 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1287 ; AVX-NEXT: vmovaps %ymm0, 320(%r8)
1288 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1289 ; AVX-NEXT: vmovaps %ymm0, 224(%r8)
1290 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1291 ; AVX-NEXT: vmovaps %ymm0, 192(%r8)
1292 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1293 ; AVX-NEXT: vmovaps %ymm0, 96(%r8)
1294 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1295 ; AVX-NEXT: vmovaps %ymm0, 64(%r8)
1296 ; AVX-NEXT: addq $152, %rsp
1297 ; AVX-NEXT: vzeroupper
1300 ; AVX2-LABEL: store_i64_stride4_vf16:
1302 ; AVX2-NEXT: pushq %rax
1303 ; AVX2-NEXT: vmovaps 32(%rdi), %ymm4
1304 ; AVX2-NEXT: vmovaps (%rdi), %ymm8
1305 ; AVX2-NEXT: vmovaps 32(%rsi), %ymm5
1306 ; AVX2-NEXT: vmovaps (%rsi), %ymm9
1307 ; AVX2-NEXT: vmovaps (%rdx), %ymm11
1308 ; AVX2-NEXT: vmovaps (%rcx), %ymm13
1309 ; AVX2-NEXT: vmovaps (%rsi), %xmm0
1310 ; AVX2-NEXT: vmovaps 32(%rsi), %xmm2
1311 ; AVX2-NEXT: vmovaps 64(%rsi), %xmm6
1312 ; AVX2-NEXT: vinsertf128 $1, (%rcx), %ymm0, %ymm1
1313 ; AVX2-NEXT: vmovaps (%rdi), %xmm0
1314 ; AVX2-NEXT: vmovaps 32(%rdi), %xmm3
1315 ; AVX2-NEXT: vmovaps 64(%rdi), %xmm7
1316 ; AVX2-NEXT: vinsertf128 $1, (%rdx), %ymm0, %ymm10
1317 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm1[0],ymm10[2],ymm1[2]
1318 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1319 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm1[1],ymm10[3],ymm1[3]
1320 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1321 ; AVX2-NEXT: vinsertf128 $1, 32(%rcx), %ymm2, %ymm10
1322 ; AVX2-NEXT: vinsertf128 $1, 32(%rdx), %ymm3, %ymm3
1323 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm10[0],ymm3[2],ymm10[2]
1324 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1325 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm10[1],ymm3[3],ymm10[3]
1326 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1327 ; AVX2-NEXT: vinsertf128 $1, 64(%rcx), %ymm6, %ymm10
1328 ; AVX2-NEXT: vinsertf128 $1, 64(%rdx), %ymm7, %ymm7
1329 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm7[0],ymm10[0],ymm7[2],ymm10[2]
1330 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm10[1],ymm7[3],ymm10[3]
1331 ; AVX2-NEXT: vmovaps 96(%rsi), %xmm10
1332 ; AVX2-NEXT: vinsertf128 $1, 96(%rcx), %ymm10, %ymm12
1333 ; AVX2-NEXT: vmovaps 96(%rdi), %xmm10
1334 ; AVX2-NEXT: vinsertf128 $1, 96(%rdx), %ymm10, %ymm14
1335 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm14[0],ymm12[0],ymm14[2],ymm12[2]
1336 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm14[1],ymm12[1],ymm14[3],ymm12[3]
1337 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm11[0],ymm13[0],ymm11[2],ymm13[2]
1338 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm8[0],ymm9[0],ymm8[2],ymm9[2]
1339 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm15[2,3],ymm14[2,3]
1340 ; AVX2-NEXT: vmovaps 32(%rdx), %ymm15
1341 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm11[1],ymm13[1],ymm11[3],ymm13[3]
1342 ; AVX2-NEXT: vmovaps 32(%rcx), %ymm13
1343 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm8[1],ymm9[1],ymm8[3],ymm9[3]
1344 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm8[2,3],ymm11[2,3]
1345 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm15[0],ymm13[0],ymm15[2],ymm13[2]
1346 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
1347 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm11[2,3],ymm9[2,3]
1348 ; AVX2-NEXT: vmovaps 64(%rdi), %ymm11
1349 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm15[1],ymm13[1],ymm15[3],ymm13[3]
1350 ; AVX2-NEXT: vmovaps 64(%rsi), %ymm15
1351 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
1352 ; AVX2-NEXT: vmovaps 64(%rdx), %ymm5
1353 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm13[2,3]
1354 ; AVX2-NEXT: vmovaps 64(%rcx), %ymm13
1355 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm13[0],ymm5[2],ymm13[2]
1356 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm11[0],ymm15[0],ymm11[2],ymm15[2]
1357 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],ymm0[2,3]
1358 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm5[1],ymm13[1],ymm5[3],ymm13[3]
1359 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm11[1],ymm15[1],ymm11[3],ymm15[3]
1360 ; AVX2-NEXT: vmovaps 96(%rdi), %ymm11
1361 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[2,3],ymm1[2,3]
1362 ; AVX2-NEXT: vmovaps 96(%rsi), %ymm5
1363 ; AVX2-NEXT: vmovaps 96(%rdx), %ymm13
1364 ; AVX2-NEXT: vmovaps 96(%rcx), %ymm15
1365 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm15[0],ymm13[2],ymm15[2]
1366 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm11[0],ymm5[0],ymm11[2],ymm5[2]
1367 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
1368 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm13[1],ymm15[1],ymm13[3],ymm15[3]
1369 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm11[1],ymm5[1],ymm11[3],ymm5[3]
1370 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm5[2,3],ymm3[2,3]
1371 ; AVX2-NEXT: vmovaps %ymm3, 480(%r8)
1372 ; AVX2-NEXT: vmovaps %ymm0, 448(%r8)
1373 ; AVX2-NEXT: vmovaps %ymm1, 352(%r8)
1374 ; AVX2-NEXT: vmovaps %ymm2, 320(%r8)
1375 ; AVX2-NEXT: vmovaps %ymm4, 224(%r8)
1376 ; AVX2-NEXT: vmovaps %ymm9, 192(%r8)
1377 ; AVX2-NEXT: vmovaps %ymm8, 96(%r8)
1378 ; AVX2-NEXT: vmovaps %ymm14, 64(%r8)
1379 ; AVX2-NEXT: vmovaps %ymm12, 416(%r8)
1380 ; AVX2-NEXT: vmovaps %ymm10, 384(%r8)
1381 ; AVX2-NEXT: vmovaps %ymm7, 288(%r8)
1382 ; AVX2-NEXT: vmovaps %ymm6, 256(%r8)
1383 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1384 ; AVX2-NEXT: vmovaps %ymm0, 160(%r8)
1385 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1386 ; AVX2-NEXT: vmovaps %ymm0, 128(%r8)
1387 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1388 ; AVX2-NEXT: vmovaps %ymm0, 32(%r8)
1389 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1390 ; AVX2-NEXT: vmovaps %ymm0, (%r8)
1391 ; AVX2-NEXT: popq %rax
1392 ; AVX2-NEXT: vzeroupper
1395 ; AVX2-FP-LABEL: store_i64_stride4_vf16:
1397 ; AVX2-FP-NEXT: pushq %rax
1398 ; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm4
1399 ; AVX2-FP-NEXT: vmovaps (%rdi), %ymm8
1400 ; AVX2-FP-NEXT: vmovaps 32(%rsi), %ymm5
1401 ; AVX2-FP-NEXT: vmovaps (%rsi), %ymm9
1402 ; AVX2-FP-NEXT: vmovaps (%rdx), %ymm11
1403 ; AVX2-FP-NEXT: vmovaps (%rcx), %ymm13
1404 ; AVX2-FP-NEXT: vmovaps (%rsi), %xmm0
1405 ; AVX2-FP-NEXT: vmovaps 32(%rsi), %xmm2
1406 ; AVX2-FP-NEXT: vmovaps 64(%rsi), %xmm6
1407 ; AVX2-FP-NEXT: vinsertf128 $1, (%rcx), %ymm0, %ymm1
1408 ; AVX2-FP-NEXT: vmovaps (%rdi), %xmm0
1409 ; AVX2-FP-NEXT: vmovaps 32(%rdi), %xmm3
1410 ; AVX2-FP-NEXT: vmovaps 64(%rdi), %xmm7
1411 ; AVX2-FP-NEXT: vinsertf128 $1, (%rdx), %ymm0, %ymm10
1412 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm1[0],ymm10[2],ymm1[2]
1413 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1414 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm1[1],ymm10[3],ymm1[3]
1415 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1416 ; AVX2-FP-NEXT: vinsertf128 $1, 32(%rcx), %ymm2, %ymm10
1417 ; AVX2-FP-NEXT: vinsertf128 $1, 32(%rdx), %ymm3, %ymm3
1418 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm10[0],ymm3[2],ymm10[2]
1419 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1420 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm10[1],ymm3[3],ymm10[3]
1421 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1422 ; AVX2-FP-NEXT: vinsertf128 $1, 64(%rcx), %ymm6, %ymm10
1423 ; AVX2-FP-NEXT: vinsertf128 $1, 64(%rdx), %ymm7, %ymm7
1424 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm7[0],ymm10[0],ymm7[2],ymm10[2]
1425 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm10[1],ymm7[3],ymm10[3]
1426 ; AVX2-FP-NEXT: vmovaps 96(%rsi), %xmm10
1427 ; AVX2-FP-NEXT: vinsertf128 $1, 96(%rcx), %ymm10, %ymm12
1428 ; AVX2-FP-NEXT: vmovaps 96(%rdi), %xmm10
1429 ; AVX2-FP-NEXT: vinsertf128 $1, 96(%rdx), %ymm10, %ymm14
1430 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm14[0],ymm12[0],ymm14[2],ymm12[2]
1431 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm14[1],ymm12[1],ymm14[3],ymm12[3]
1432 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm11[0],ymm13[0],ymm11[2],ymm13[2]
1433 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm8[0],ymm9[0],ymm8[2],ymm9[2]
1434 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm15[2,3],ymm14[2,3]
1435 ; AVX2-FP-NEXT: vmovaps 32(%rdx), %ymm15
1436 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm11[1],ymm13[1],ymm11[3],ymm13[3]
1437 ; AVX2-FP-NEXT: vmovaps 32(%rcx), %ymm13
1438 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm8[1],ymm9[1],ymm8[3],ymm9[3]
1439 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm8[2,3],ymm11[2,3]
1440 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm15[0],ymm13[0],ymm15[2],ymm13[2]
1441 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
1442 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm11[2,3],ymm9[2,3]
1443 ; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm11
1444 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm15[1],ymm13[1],ymm15[3],ymm13[3]
1445 ; AVX2-FP-NEXT: vmovaps 64(%rsi), %ymm15
1446 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
1447 ; AVX2-FP-NEXT: vmovaps 64(%rdx), %ymm5
1448 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm13[2,3]
1449 ; AVX2-FP-NEXT: vmovaps 64(%rcx), %ymm13
1450 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm13[0],ymm5[2],ymm13[2]
1451 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm11[0],ymm15[0],ymm11[2],ymm15[2]
1452 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],ymm0[2,3]
1453 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm5[1],ymm13[1],ymm5[3],ymm13[3]
1454 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm11[1],ymm15[1],ymm11[3],ymm15[3]
1455 ; AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm11
1456 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[2,3],ymm1[2,3]
1457 ; AVX2-FP-NEXT: vmovaps 96(%rsi), %ymm5
1458 ; AVX2-FP-NEXT: vmovaps 96(%rdx), %ymm13
1459 ; AVX2-FP-NEXT: vmovaps 96(%rcx), %ymm15
1460 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm15[0],ymm13[2],ymm15[2]
1461 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm11[0],ymm5[0],ymm11[2],ymm5[2]
1462 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
1463 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm13[1],ymm15[1],ymm13[3],ymm15[3]
1464 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm11[1],ymm5[1],ymm11[3],ymm5[3]
1465 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm5[2,3],ymm3[2,3]
1466 ; AVX2-FP-NEXT: vmovaps %ymm3, 480(%r8)
1467 ; AVX2-FP-NEXT: vmovaps %ymm0, 448(%r8)
1468 ; AVX2-FP-NEXT: vmovaps %ymm1, 352(%r8)
1469 ; AVX2-FP-NEXT: vmovaps %ymm2, 320(%r8)
1470 ; AVX2-FP-NEXT: vmovaps %ymm4, 224(%r8)
1471 ; AVX2-FP-NEXT: vmovaps %ymm9, 192(%r8)
1472 ; AVX2-FP-NEXT: vmovaps %ymm8, 96(%r8)
1473 ; AVX2-FP-NEXT: vmovaps %ymm14, 64(%r8)
1474 ; AVX2-FP-NEXT: vmovaps %ymm12, 416(%r8)
1475 ; AVX2-FP-NEXT: vmovaps %ymm10, 384(%r8)
1476 ; AVX2-FP-NEXT: vmovaps %ymm7, 288(%r8)
1477 ; AVX2-FP-NEXT: vmovaps %ymm6, 256(%r8)
1478 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1479 ; AVX2-FP-NEXT: vmovaps %ymm0, 160(%r8)
1480 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1481 ; AVX2-FP-NEXT: vmovaps %ymm0, 128(%r8)
1482 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1483 ; AVX2-FP-NEXT: vmovaps %ymm0, 32(%r8)
1484 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1485 ; AVX2-FP-NEXT: vmovaps %ymm0, (%r8)
1486 ; AVX2-FP-NEXT: popq %rax
1487 ; AVX2-FP-NEXT: vzeroupper
1488 ; AVX2-FP-NEXT: retq
1490 ; AVX2-FCP-LABEL: store_i64_stride4_vf16:
1491 ; AVX2-FCP: # %bb.0:
1492 ; AVX2-FCP-NEXT: pushq %rax
1493 ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm4
1494 ; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm8
1495 ; AVX2-FCP-NEXT: vmovaps 32(%rsi), %ymm5
1496 ; AVX2-FCP-NEXT: vmovaps (%rsi), %ymm9
1497 ; AVX2-FCP-NEXT: vmovaps (%rdx), %ymm11
1498 ; AVX2-FCP-NEXT: vmovaps (%rcx), %ymm13
1499 ; AVX2-FCP-NEXT: vmovaps (%rsi), %xmm0
1500 ; AVX2-FCP-NEXT: vmovaps 32(%rsi), %xmm2
1501 ; AVX2-FCP-NEXT: vmovaps 64(%rsi), %xmm6
1502 ; AVX2-FCP-NEXT: vinsertf128 $1, (%rcx), %ymm0, %ymm1
1503 ; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm0
1504 ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %xmm3
1505 ; AVX2-FCP-NEXT: vmovaps 64(%rdi), %xmm7
1506 ; AVX2-FCP-NEXT: vinsertf128 $1, (%rdx), %ymm0, %ymm10
1507 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm1[0],ymm10[2],ymm1[2]
1508 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1509 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm1[1],ymm10[3],ymm1[3]
1510 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1511 ; AVX2-FCP-NEXT: vinsertf128 $1, 32(%rcx), %ymm2, %ymm10
1512 ; AVX2-FCP-NEXT: vinsertf128 $1, 32(%rdx), %ymm3, %ymm3
1513 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm10[0],ymm3[2],ymm10[2]
1514 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1515 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm10[1],ymm3[3],ymm10[3]
1516 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1517 ; AVX2-FCP-NEXT: vinsertf128 $1, 64(%rcx), %ymm6, %ymm10
1518 ; AVX2-FCP-NEXT: vinsertf128 $1, 64(%rdx), %ymm7, %ymm7
1519 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm7[0],ymm10[0],ymm7[2],ymm10[2]
1520 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm10[1],ymm7[3],ymm10[3]
1521 ; AVX2-FCP-NEXT: vmovaps 96(%rsi), %xmm10
1522 ; AVX2-FCP-NEXT: vinsertf128 $1, 96(%rcx), %ymm10, %ymm12
1523 ; AVX2-FCP-NEXT: vmovaps 96(%rdi), %xmm10
1524 ; AVX2-FCP-NEXT: vinsertf128 $1, 96(%rdx), %ymm10, %ymm14
1525 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm14[0],ymm12[0],ymm14[2],ymm12[2]
1526 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm14[1],ymm12[1],ymm14[3],ymm12[3]
1527 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm11[0],ymm13[0],ymm11[2],ymm13[2]
1528 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm8[0],ymm9[0],ymm8[2],ymm9[2]
1529 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm15[2,3],ymm14[2,3]
1530 ; AVX2-FCP-NEXT: vmovaps 32(%rdx), %ymm15
1531 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm11[1],ymm13[1],ymm11[3],ymm13[3]
1532 ; AVX2-FCP-NEXT: vmovaps 32(%rcx), %ymm13
1533 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm8[1],ymm9[1],ymm8[3],ymm9[3]
1534 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm8[2,3],ymm11[2,3]
1535 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm15[0],ymm13[0],ymm15[2],ymm13[2]
1536 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
1537 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm11[2,3],ymm9[2,3]
1538 ; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm11
1539 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm15[1],ymm13[1],ymm15[3],ymm13[3]
1540 ; AVX2-FCP-NEXT: vmovaps 64(%rsi), %ymm15
1541 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
1542 ; AVX2-FCP-NEXT: vmovaps 64(%rdx), %ymm5
1543 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm13[2,3]
1544 ; AVX2-FCP-NEXT: vmovaps 64(%rcx), %ymm13
1545 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm13[0],ymm5[2],ymm13[2]
1546 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm11[0],ymm15[0],ymm11[2],ymm15[2]
1547 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],ymm0[2,3]
1548 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm5[1],ymm13[1],ymm5[3],ymm13[3]
1549 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm11[1],ymm15[1],ymm11[3],ymm15[3]
1550 ; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm11
1551 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[2,3],ymm1[2,3]
1552 ; AVX2-FCP-NEXT: vmovaps 96(%rsi), %ymm5
1553 ; AVX2-FCP-NEXT: vmovaps 96(%rdx), %ymm13
1554 ; AVX2-FCP-NEXT: vmovaps 96(%rcx), %ymm15
1555 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm15[0],ymm13[2],ymm15[2]
1556 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm11[0],ymm5[0],ymm11[2],ymm5[2]
1557 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
1558 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm13[1],ymm15[1],ymm13[3],ymm15[3]
1559 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm11[1],ymm5[1],ymm11[3],ymm5[3]
1560 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm5[2,3],ymm3[2,3]
1561 ; AVX2-FCP-NEXT: vmovaps %ymm3, 480(%r8)
1562 ; AVX2-FCP-NEXT: vmovaps %ymm0, 448(%r8)
1563 ; AVX2-FCP-NEXT: vmovaps %ymm1, 352(%r8)
1564 ; AVX2-FCP-NEXT: vmovaps %ymm2, 320(%r8)
1565 ; AVX2-FCP-NEXT: vmovaps %ymm4, 224(%r8)
1566 ; AVX2-FCP-NEXT: vmovaps %ymm9, 192(%r8)
1567 ; AVX2-FCP-NEXT: vmovaps %ymm8, 96(%r8)
1568 ; AVX2-FCP-NEXT: vmovaps %ymm14, 64(%r8)
1569 ; AVX2-FCP-NEXT: vmovaps %ymm12, 416(%r8)
1570 ; AVX2-FCP-NEXT: vmovaps %ymm10, 384(%r8)
1571 ; AVX2-FCP-NEXT: vmovaps %ymm7, 288(%r8)
1572 ; AVX2-FCP-NEXT: vmovaps %ymm6, 256(%r8)
1573 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1574 ; AVX2-FCP-NEXT: vmovaps %ymm0, 160(%r8)
1575 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1576 ; AVX2-FCP-NEXT: vmovaps %ymm0, 128(%r8)
1577 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1578 ; AVX2-FCP-NEXT: vmovaps %ymm0, 32(%r8)
1579 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1580 ; AVX2-FCP-NEXT: vmovaps %ymm0, (%r8)
1581 ; AVX2-FCP-NEXT: popq %rax
1582 ; AVX2-FCP-NEXT: vzeroupper
1583 ; AVX2-FCP-NEXT: retq
1585 ; AVX512-LABEL: store_i64_stride4_vf16:
1587 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
1588 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1
1589 ; AVX512-NEXT: vmovdqa64 (%rsi), %zmm2
1590 ; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm3
1591 ; AVX512-NEXT: vmovdqa64 (%rdx), %zmm4
1592 ; AVX512-NEXT: vmovdqa64 64(%rdx), %zmm5
1593 ; AVX512-NEXT: vmovdqa64 (%rcx), %zmm6
1594 ; AVX512-NEXT: vmovdqa64 64(%rcx), %zmm7
1595 ; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,2,10,0,0,3,11]
1596 ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm9
1597 ; AVX512-NEXT: vpermt2q %zmm6, %zmm8, %zmm9
1598 ; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm10 = [2,10,0,0,3,11,0,0]
1599 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm11
1600 ; AVX512-NEXT: vpermt2q %zmm2, %zmm10, %zmm11
1601 ; AVX512-NEXT: movb $-52, %al
1602 ; AVX512-NEXT: kmovw %eax, %k1
1603 ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm11 {%k1}
1604 ; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,8,0,0,1,9]
1605 ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm12
1606 ; AVX512-NEXT: vpermt2q %zmm6, %zmm9, %zmm12
1607 ; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,8,0,0,1,9,0,0]
1608 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm14
1609 ; AVX512-NEXT: vpermt2q %zmm2, %zmm13, %zmm14
1610 ; AVX512-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1}
1611 ; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,6,14,0,0,7,15]
1612 ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm15
1613 ; AVX512-NEXT: vpermt2q %zmm6, %zmm12, %zmm15
1614 ; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm16 = [6,14,0,0,7,15,0,0]
1615 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm17
1616 ; AVX512-NEXT: vpermt2q %zmm2, %zmm16, %zmm17
1617 ; AVX512-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1}
1618 ; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,4,12,0,0,5,13]
1619 ; AVX512-NEXT: vpermt2q %zmm6, %zmm15, %zmm4
1620 ; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm6 = [4,12,0,0,5,13,0,0]
1621 ; AVX512-NEXT: vpermt2q %zmm2, %zmm6, %zmm0
1622 ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1}
1623 ; AVX512-NEXT: vpermi2q %zmm7, %zmm5, %zmm8
1624 ; AVX512-NEXT: vpermi2q %zmm3, %zmm1, %zmm10
1625 ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
1626 ; AVX512-NEXT: vpermi2q %zmm7, %zmm5, %zmm9
1627 ; AVX512-NEXT: vpermi2q %zmm3, %zmm1, %zmm13
1628 ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm13 {%k1}
1629 ; AVX512-NEXT: vpermi2q %zmm7, %zmm5, %zmm12
1630 ; AVX512-NEXT: vpermi2q %zmm3, %zmm1, %zmm16
1631 ; AVX512-NEXT: vmovdqa64 %zmm12, %zmm16 {%k1}
1632 ; AVX512-NEXT: vpermt2q %zmm7, %zmm15, %zmm5
1633 ; AVX512-NEXT: vpermt2q %zmm3, %zmm6, %zmm1
1634 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm1 {%k1}
1635 ; AVX512-NEXT: vmovdqa64 %zmm1, 384(%r8)
1636 ; AVX512-NEXT: vmovdqa64 %zmm16, 448(%r8)
1637 ; AVX512-NEXT: vmovdqa64 %zmm13, 256(%r8)
1638 ; AVX512-NEXT: vmovdqa64 %zmm10, 320(%r8)
1639 ; AVX512-NEXT: vmovdqa64 %zmm0, 128(%r8)
1640 ; AVX512-NEXT: vmovdqa64 %zmm17, 192(%r8)
1641 ; AVX512-NEXT: vmovdqa64 %zmm14, (%r8)
1642 ; AVX512-NEXT: vmovdqa64 %zmm11, 64(%r8)
1643 ; AVX512-NEXT: vzeroupper
1646 ; AVX512-FCP-LABEL: store_i64_stride4_vf16:
1647 ; AVX512-FCP: # %bb.0:
1648 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
1649 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
1650 ; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm2
1651 ; AVX512-FCP-NEXT: vmovdqa64 64(%rsi), %zmm3
1652 ; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm4
1653 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdx), %zmm5
1654 ; AVX512-FCP-NEXT: vmovdqa64 (%rcx), %zmm6
1655 ; AVX512-FCP-NEXT: vmovdqa64 64(%rcx), %zmm7
1656 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,2,10,0,0,3,11]
1657 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm9
1658 ; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm8, %zmm9
1659 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [2,10,0,0,3,11,0,0]
1660 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm11
1661 ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm11
1662 ; AVX512-FCP-NEXT: movb $-52, %al
1663 ; AVX512-FCP-NEXT: kmovw %eax, %k1
1664 ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm11 {%k1}
1665 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,8,0,0,1,9]
1666 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm12
1667 ; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm9, %zmm12
1668 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,8,0,0,1,9,0,0]
1669 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm14
1670 ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm13, %zmm14
1671 ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1}
1672 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,6,14,0,0,7,15]
1673 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm15
1674 ; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm12, %zmm15
1675 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [6,14,0,0,7,15,0,0]
1676 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm17
1677 ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm17
1678 ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1}
1679 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,4,12,0,0,5,13]
1680 ; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm15, %zmm4
1681 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [4,12,0,0,5,13,0,0]
1682 ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm6, %zmm0
1683 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1}
1684 ; AVX512-FCP-NEXT: vpermi2q %zmm7, %zmm5, %zmm8
1685 ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm10
1686 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
1687 ; AVX512-FCP-NEXT: vpermi2q %zmm7, %zmm5, %zmm9
1688 ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm13
1689 ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm13 {%k1}
1690 ; AVX512-FCP-NEXT: vpermi2q %zmm7, %zmm5, %zmm12
1691 ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm16
1692 ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm16 {%k1}
1693 ; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm15, %zmm5
1694 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm1
1695 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 {%k1}
1696 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 384(%r8)
1697 ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, 448(%r8)
1698 ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 256(%r8)
1699 ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, 320(%r8)
1700 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 128(%r8)
1701 ; AVX512-FCP-NEXT: vmovdqa64 %zmm17, 192(%r8)
1702 ; AVX512-FCP-NEXT: vmovdqa64 %zmm14, (%r8)
1703 ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, 64(%r8)
1704 ; AVX512-FCP-NEXT: vzeroupper
1705 ; AVX512-FCP-NEXT: retq
1707 ; AVX512DQ-LABEL: store_i64_stride4_vf16:
1708 ; AVX512DQ: # %bb.0:
1709 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0
1710 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm1
1711 ; AVX512DQ-NEXT: vmovdqa64 (%rsi), %zmm2
1712 ; AVX512DQ-NEXT: vmovdqa64 64(%rsi), %zmm3
1713 ; AVX512DQ-NEXT: vmovdqa64 (%rdx), %zmm4
1714 ; AVX512DQ-NEXT: vmovdqa64 64(%rdx), %zmm5
1715 ; AVX512DQ-NEXT: vmovdqa64 (%rcx), %zmm6
1716 ; AVX512DQ-NEXT: vmovdqa64 64(%rcx), %zmm7
1717 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,2,10,0,0,3,11]
1718 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm9
1719 ; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm8, %zmm9
1720 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm10 = [2,10,0,0,3,11,0,0]
1721 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm11
1722 ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm10, %zmm11
1723 ; AVX512DQ-NEXT: movb $-52, %al
1724 ; AVX512DQ-NEXT: kmovw %eax, %k1
1725 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm11 {%k1}
1726 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,8,0,0,1,9]
1727 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm12
1728 ; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm9, %zmm12
1729 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,8,0,0,1,9,0,0]
1730 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm14
1731 ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm13, %zmm14
1732 ; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1}
1733 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,6,14,0,0,7,15]
1734 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm15
1735 ; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm12, %zmm15
1736 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm16 = [6,14,0,0,7,15,0,0]
1737 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm17
1738 ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm16, %zmm17
1739 ; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1}
1740 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,4,12,0,0,5,13]
1741 ; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm15, %zmm4
1742 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm6 = [4,12,0,0,5,13,0,0]
1743 ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm6, %zmm0
1744 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1}
1745 ; AVX512DQ-NEXT: vpermi2q %zmm7, %zmm5, %zmm8
1746 ; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm1, %zmm10
1747 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
1748 ; AVX512DQ-NEXT: vpermi2q %zmm7, %zmm5, %zmm9
1749 ; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm1, %zmm13
1750 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm13 {%k1}
1751 ; AVX512DQ-NEXT: vpermi2q %zmm7, %zmm5, %zmm12
1752 ; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm1, %zmm16
1753 ; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm16 {%k1}
1754 ; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm15, %zmm5
1755 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm6, %zmm1
1756 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm1 {%k1}
1757 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, 384(%r8)
1758 ; AVX512DQ-NEXT: vmovdqa64 %zmm16, 448(%r8)
1759 ; AVX512DQ-NEXT: vmovdqa64 %zmm13, 256(%r8)
1760 ; AVX512DQ-NEXT: vmovdqa64 %zmm10, 320(%r8)
1761 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 128(%r8)
1762 ; AVX512DQ-NEXT: vmovdqa64 %zmm17, 192(%r8)
1763 ; AVX512DQ-NEXT: vmovdqa64 %zmm14, (%r8)
1764 ; AVX512DQ-NEXT: vmovdqa64 %zmm11, 64(%r8)
1765 ; AVX512DQ-NEXT: vzeroupper
1766 ; AVX512DQ-NEXT: retq
1768 ; AVX512DQ-FCP-LABEL: store_i64_stride4_vf16:
1769 ; AVX512DQ-FCP: # %bb.0:
1770 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
1771 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
1772 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm2
1773 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rsi), %zmm3
1774 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm4
1775 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), %zmm5
1776 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rcx), %zmm6
1777 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rcx), %zmm7
1778 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,2,10,0,0,3,11]
1779 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm9
1780 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm8, %zmm9
1781 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [2,10,0,0,3,11,0,0]
1782 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm11
1783 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm11
1784 ; AVX512DQ-FCP-NEXT: movb $-52, %al
1785 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1
1786 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm11 {%k1}
1787 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,8,0,0,1,9]
1788 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm12
1789 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm9, %zmm12
1790 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,8,0,0,1,9,0,0]
1791 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm14
1792 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm13, %zmm14
1793 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1}
1794 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,6,14,0,0,7,15]
1795 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm15
1796 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm12, %zmm15
1797 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [6,14,0,0,7,15,0,0]
1798 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm17
1799 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm17
1800 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1}
1801 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,4,12,0,0,5,13]
1802 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm15, %zmm4
1803 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [4,12,0,0,5,13,0,0]
1804 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm6, %zmm0
1805 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1}
1806 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm7, %zmm5, %zmm8
1807 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm10
1808 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
1809 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm7, %zmm5, %zmm9
1810 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm13
1811 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm13 {%k1}
1812 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm7, %zmm5, %zmm12
1813 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm16
1814 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm16 {%k1}
1815 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm15, %zmm5
1816 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm1
1817 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 {%k1}
1818 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 384(%r8)
1819 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, 448(%r8)
1820 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 256(%r8)
1821 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, 320(%r8)
1822 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 128(%r8)
1823 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, 192(%r8)
1824 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, (%r8)
1825 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, 64(%r8)
1826 ; AVX512DQ-FCP-NEXT: vzeroupper
1827 ; AVX512DQ-FCP-NEXT: retq
1829 ; AVX512BW-LABEL: store_i64_stride4_vf16:
1830 ; AVX512BW: # %bb.0:
1831 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
1832 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1
1833 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm2
1834 ; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm3
1835 ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm4
1836 ; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm5
1837 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm6
1838 ; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm7
1839 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,2,10,0,0,3,11]
1840 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm9
1841 ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm8, %zmm9
1842 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm10 = [2,10,0,0,3,11,0,0]
1843 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11
1844 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm11
1845 ; AVX512BW-NEXT: movb $-52, %al
1846 ; AVX512BW-NEXT: kmovd %eax, %k1
1847 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm11 {%k1}
1848 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,8,0,0,1,9]
1849 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm12
1850 ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm9, %zmm12
1851 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,8,0,0,1,9,0,0]
1852 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14
1853 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm13, %zmm14
1854 ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1}
1855 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,6,14,0,0,7,15]
1856 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm15
1857 ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm12, %zmm15
1858 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm16 = [6,14,0,0,7,15,0,0]
1859 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm17
1860 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm17
1861 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1}
1862 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,4,12,0,0,5,13]
1863 ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm15, %zmm4
1864 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [4,12,0,0,5,13,0,0]
1865 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm6, %zmm0
1866 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1}
1867 ; AVX512BW-NEXT: vpermi2q %zmm7, %zmm5, %zmm8
1868 ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm1, %zmm10
1869 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
1870 ; AVX512BW-NEXT: vpermi2q %zmm7, %zmm5, %zmm9
1871 ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm1, %zmm13
1872 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm13 {%k1}
1873 ; AVX512BW-NEXT: vpermi2q %zmm7, %zmm5, %zmm12
1874 ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm1, %zmm16
1875 ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm16 {%k1}
1876 ; AVX512BW-NEXT: vpermt2q %zmm7, %zmm15, %zmm5
1877 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm6, %zmm1
1878 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 {%k1}
1879 ; AVX512BW-NEXT: vmovdqa64 %zmm1, 384(%r8)
1880 ; AVX512BW-NEXT: vmovdqa64 %zmm16, 448(%r8)
1881 ; AVX512BW-NEXT: vmovdqa64 %zmm13, 256(%r8)
1882 ; AVX512BW-NEXT: vmovdqa64 %zmm10, 320(%r8)
1883 ; AVX512BW-NEXT: vmovdqa64 %zmm0, 128(%r8)
1884 ; AVX512BW-NEXT: vmovdqa64 %zmm17, 192(%r8)
1885 ; AVX512BW-NEXT: vmovdqa64 %zmm14, (%r8)
1886 ; AVX512BW-NEXT: vmovdqa64 %zmm11, 64(%r8)
1887 ; AVX512BW-NEXT: vzeroupper
1888 ; AVX512BW-NEXT: retq
1890 ; AVX512BW-FCP-LABEL: store_i64_stride4_vf16:
1891 ; AVX512BW-FCP: # %bb.0:
1892 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
1893 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
1894 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm2
1895 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm3
1896 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm4
1897 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm5
1898 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm6
1899 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm7
1900 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,2,10,0,0,3,11]
1901 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm9
1902 ; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm8, %zmm9
1903 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [2,10,0,0,3,11,0,0]
1904 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm11
1905 ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm11
1906 ; AVX512BW-FCP-NEXT: movb $-52, %al
1907 ; AVX512BW-FCP-NEXT: kmovd %eax, %k1
1908 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm11 {%k1}
1909 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,8,0,0,1,9]
1910 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm12
1911 ; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm9, %zmm12
1912 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,8,0,0,1,9,0,0]
1913 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm14
1914 ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm13, %zmm14
1915 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1}
1916 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,6,14,0,0,7,15]
1917 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm15
1918 ; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm12, %zmm15
1919 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [6,14,0,0,7,15,0,0]
1920 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm17
1921 ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm17
1922 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1}
1923 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,4,12,0,0,5,13]
1924 ; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm15, %zmm4
1925 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [4,12,0,0,5,13,0,0]
1926 ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm6, %zmm0
1927 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1}
1928 ; AVX512BW-FCP-NEXT: vpermi2q %zmm7, %zmm5, %zmm8
1929 ; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm10
1930 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
1931 ; AVX512BW-FCP-NEXT: vpermi2q %zmm7, %zmm5, %zmm9
1932 ; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm13
1933 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm13 {%k1}
1934 ; AVX512BW-FCP-NEXT: vpermi2q %zmm7, %zmm5, %zmm12
1935 ; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm16
1936 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm16 {%k1}
1937 ; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm15, %zmm5
1938 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm1
1939 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 {%k1}
1940 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 384(%r8)
1941 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, 448(%r8)
1942 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, 256(%r8)
1943 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, 320(%r8)
1944 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 128(%r8)
1945 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, 192(%r8)
1946 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, (%r8)
1947 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, 64(%r8)
1948 ; AVX512BW-FCP-NEXT: vzeroupper
1949 ; AVX512BW-FCP-NEXT: retq
1951 ; AVX512DQ-BW-LABEL: store_i64_stride4_vf16:
1952 ; AVX512DQ-BW: # %bb.0:
1953 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0
1954 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1
1955 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm2
1956 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm3
1957 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm4
1958 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm5
1959 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm6
1960 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rcx), %zmm7
1961 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,2,10,0,0,3,11]
1962 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm9
1963 ; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm8, %zmm9
1964 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm10 = [2,10,0,0,3,11,0,0]
1965 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm11
1966 ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm11
1967 ; AVX512DQ-BW-NEXT: movb $-52, %al
1968 ; AVX512DQ-BW-NEXT: kmovd %eax, %k1
1969 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm11 {%k1}
1970 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,8,0,0,1,9]
1971 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm12
1972 ; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm9, %zmm12
1973 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,8,0,0,1,9,0,0]
1974 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm14
1975 ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm13, %zmm14
1976 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1}
1977 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,6,14,0,0,7,15]
1978 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm15
1979 ; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm12, %zmm15
1980 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm16 = [6,14,0,0,7,15,0,0]
1981 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm17
1982 ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm17
1983 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1}
1984 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,4,12,0,0,5,13]
1985 ; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm15, %zmm4
1986 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [4,12,0,0,5,13,0,0]
1987 ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm6, %zmm0
1988 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1}
1989 ; AVX512DQ-BW-NEXT: vpermi2q %zmm7, %zmm5, %zmm8
1990 ; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm1, %zmm10
1991 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
1992 ; AVX512DQ-BW-NEXT: vpermi2q %zmm7, %zmm5, %zmm9
1993 ; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm1, %zmm13
1994 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm13 {%k1}
1995 ; AVX512DQ-BW-NEXT: vpermi2q %zmm7, %zmm5, %zmm12
1996 ; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm1, %zmm16
1997 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm16 {%k1}
1998 ; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm15, %zmm5
1999 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm6, %zmm1
2000 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm1 {%k1}
2001 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 384(%r8)
2002 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, 448(%r8)
2003 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, 256(%r8)
2004 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, 320(%r8)
2005 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 128(%r8)
2006 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, 192(%r8)
2007 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, (%r8)
2008 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, 64(%r8)
2009 ; AVX512DQ-BW-NEXT: vzeroupper
2010 ; AVX512DQ-BW-NEXT: retq
2012 ; AVX512DQ-BW-FCP-LABEL: store_i64_stride4_vf16:
2013 ; AVX512DQ-BW-FCP: # %bb.0:
2014 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
2015 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
2016 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm2
2017 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm3
2018 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm4
2019 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm5
2020 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm6
2021 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm7
2022 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,2,10,0,0,3,11]
2023 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm9
2024 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm8, %zmm9
2025 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [2,10,0,0,3,11,0,0]
2026 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm11
2027 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm11
2028 ; AVX512DQ-BW-FCP-NEXT: movb $-52, %al
2029 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1
2030 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm11 {%k1}
2031 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,8,0,0,1,9]
2032 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm12
2033 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm9, %zmm12
2034 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,8,0,0,1,9,0,0]
2035 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm14
2036 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm13, %zmm14
2037 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1}
2038 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,6,14,0,0,7,15]
2039 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm15
2040 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm12, %zmm15
2041 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [6,14,0,0,7,15,0,0]
2042 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm17
2043 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm17
2044 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1}
2045 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,4,12,0,0,5,13]
2046 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm15, %zmm4
2047 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [4,12,0,0,5,13,0,0]
2048 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm6, %zmm0
2049 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1}
2050 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm7, %zmm5, %zmm8
2051 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm10
2052 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
2053 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm7, %zmm5, %zmm9
2054 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm13
2055 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm13 {%k1}
2056 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm7, %zmm5, %zmm12
2057 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm16
2058 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm16 {%k1}
2059 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm15, %zmm5
2060 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm1
2061 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 {%k1}
2062 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 384(%r8)
2063 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, 448(%r8)
2064 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, 256(%r8)
2065 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, 320(%r8)
2066 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 128(%r8)
2067 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, 192(%r8)
2068 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, (%r8)
2069 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, 64(%r8)
2070 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
2071 ; AVX512DQ-BW-FCP-NEXT: retq
2072 %in.vec0 = load <16 x i64>, ptr %in.vecptr0, align 64
2073 %in.vec1 = load <16 x i64>, ptr %in.vecptr1, align 64
2074 %in.vec2 = load <16 x i64>, ptr %in.vecptr2, align 64
2075 %in.vec3 = load <16 x i64>, ptr %in.vecptr3, align 64
2076 %1 = shufflevector <16 x i64> %in.vec0, <16 x i64> %in.vec1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
2077 %2 = shufflevector <16 x i64> %in.vec2, <16 x i64> %in.vec3, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
2078 %3 = shufflevector <32 x i64> %1, <32 x i64> %2, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
2079 %interleaved.vec = shufflevector <64 x i64> %3, <64 x i64> poison, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63>
2080 store <64 x i64> %interleaved.vec, ptr %out.vec, align 64
2084 define void @store_i64_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %out.vec) nounwind {
2085 ; SSE-LABEL: store_i64_stride4_vf32:
2087 ; SSE-NEXT: subq $664, %rsp # imm = 0x298
2088 ; SSE-NEXT: movaps (%rdi), %xmm7
2089 ; SSE-NEXT: movaps 16(%rdi), %xmm8
2090 ; SSE-NEXT: movaps 32(%rdi), %xmm9
2091 ; SSE-NEXT: movaps 48(%rdi), %xmm10
2092 ; SSE-NEXT: movaps (%rsi), %xmm3
2093 ; SSE-NEXT: movaps 16(%rsi), %xmm2
2094 ; SSE-NEXT: movaps 32(%rsi), %xmm1
2095 ; SSE-NEXT: movaps 48(%rsi), %xmm0
2096 ; SSE-NEXT: movaps (%rdx), %xmm11
2097 ; SSE-NEXT: movaps 16(%rdx), %xmm12
2098 ; SSE-NEXT: movaps 32(%rdx), %xmm13
2099 ; SSE-NEXT: movaps 48(%rdx), %xmm14
2100 ; SSE-NEXT: movaps (%rcx), %xmm4
2101 ; SSE-NEXT: movaps 16(%rcx), %xmm5
2102 ; SSE-NEXT: movaps 32(%rcx), %xmm6
2103 ; SSE-NEXT: movaps %xmm7, %xmm15
2104 ; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm3[0]
2105 ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2106 ; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm3[1]
2107 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2108 ; SSE-NEXT: movaps %xmm11, %xmm7
2109 ; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm4[0]
2110 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2111 ; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm4[1]
2112 ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2113 ; SSE-NEXT: movaps %xmm8, %xmm3
2114 ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0]
2115 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2116 ; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm2[1]
2117 ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2118 ; SSE-NEXT: movaps %xmm12, %xmm2
2119 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm5[0]
2120 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2121 ; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm5[1]
2122 ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2123 ; SSE-NEXT: movaps %xmm9, %xmm2
2124 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
2125 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2126 ; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm1[1]
2127 ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2128 ; SSE-NEXT: movaps %xmm13, %xmm1
2129 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm6[0]
2130 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2131 ; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm6[1]
2132 ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2133 ; SSE-NEXT: movaps %xmm10, %xmm1
2134 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
2135 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2136 ; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1]
2137 ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2138 ; SSE-NEXT: movaps 48(%rcx), %xmm0
2139 ; SSE-NEXT: movaps %xmm14, %xmm1
2140 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
2141 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2142 ; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1]
2143 ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2144 ; SSE-NEXT: movaps 64(%rdi), %xmm2
2145 ; SSE-NEXT: movaps 64(%rsi), %xmm0
2146 ; SSE-NEXT: movaps %xmm2, %xmm1
2147 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
2148 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2149 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
2150 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2151 ; SSE-NEXT: movaps 64(%rdx), %xmm2
2152 ; SSE-NEXT: movaps 64(%rcx), %xmm0
2153 ; SSE-NEXT: movaps %xmm2, %xmm1
2154 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
2155 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2156 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
2157 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2158 ; SSE-NEXT: movaps 80(%rdi), %xmm2
2159 ; SSE-NEXT: movaps 80(%rsi), %xmm0
2160 ; SSE-NEXT: movaps %xmm2, %xmm1
2161 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
2162 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2163 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
2164 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2165 ; SSE-NEXT: movaps 80(%rdx), %xmm2
2166 ; SSE-NEXT: movaps 80(%rcx), %xmm0
2167 ; SSE-NEXT: movaps %xmm2, %xmm1
2168 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
2169 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2170 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
2171 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2172 ; SSE-NEXT: movaps 96(%rdi), %xmm2
2173 ; SSE-NEXT: movaps 96(%rsi), %xmm0
2174 ; SSE-NEXT: movaps %xmm2, %xmm1
2175 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
2176 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2177 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
2178 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2179 ; SSE-NEXT: movaps 96(%rdx), %xmm2
2180 ; SSE-NEXT: movaps 96(%rcx), %xmm0
2181 ; SSE-NEXT: movaps %xmm2, %xmm1
2182 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
2183 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2184 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
2185 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2186 ; SSE-NEXT: movaps 112(%rdi), %xmm2
2187 ; SSE-NEXT: movaps 112(%rsi), %xmm0
2188 ; SSE-NEXT: movaps %xmm2, %xmm1
2189 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
2190 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2191 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
2192 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2193 ; SSE-NEXT: movaps 112(%rdx), %xmm2
2194 ; SSE-NEXT: movaps 112(%rcx), %xmm0
2195 ; SSE-NEXT: movaps %xmm2, %xmm1
2196 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
2197 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2198 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
2199 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2200 ; SSE-NEXT: movaps 128(%rdi), %xmm2
2201 ; SSE-NEXT: movaps 128(%rsi), %xmm0
2202 ; SSE-NEXT: movaps %xmm2, %xmm1
2203 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
2204 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2205 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
2206 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2207 ; SSE-NEXT: movaps 128(%rdx), %xmm2
2208 ; SSE-NEXT: movaps 128(%rcx), %xmm0
2209 ; SSE-NEXT: movaps %xmm2, %xmm1
2210 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
2211 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2212 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
2213 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2214 ; SSE-NEXT: movaps 144(%rdi), %xmm2
2215 ; SSE-NEXT: movaps 144(%rsi), %xmm0
2216 ; SSE-NEXT: movaps %xmm2, %xmm1
2217 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
2218 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2219 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
2220 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2221 ; SSE-NEXT: movaps 144(%rdx), %xmm2
2222 ; SSE-NEXT: movaps 144(%rcx), %xmm0
2223 ; SSE-NEXT: movaps %xmm2, %xmm1
2224 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
2225 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2226 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
2227 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2228 ; SSE-NEXT: movaps 160(%rdi), %xmm2
2229 ; SSE-NEXT: movaps 160(%rsi), %xmm0
2230 ; SSE-NEXT: movaps %xmm2, %xmm1
2231 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
2232 ; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill
2233 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
2234 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2235 ; SSE-NEXT: movaps 160(%rdx), %xmm2
2236 ; SSE-NEXT: movaps 160(%rcx), %xmm0
2237 ; SSE-NEXT: movaps %xmm2, %xmm1
2238 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
2239 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2240 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
2241 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2242 ; SSE-NEXT: movaps 176(%rdi), %xmm2
2243 ; SSE-NEXT: movaps 176(%rsi), %xmm0
2244 ; SSE-NEXT: movaps %xmm2, %xmm1
2245 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
2246 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2247 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
2248 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2249 ; SSE-NEXT: movaps 176(%rdx), %xmm14
2250 ; SSE-NEXT: movaps 176(%rcx), %xmm0
2251 ; SSE-NEXT: movaps %xmm14, %xmm1
2252 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
2253 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2254 ; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1]
2255 ; SSE-NEXT: movaps 192(%rdi), %xmm12
2256 ; SSE-NEXT: movaps 192(%rsi), %xmm0
2257 ; SSE-NEXT: movaps %xmm12, %xmm1
2258 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
2259 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2260 ; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1]
2261 ; SSE-NEXT: movaps 192(%rdx), %xmm15
2262 ; SSE-NEXT: movaps 192(%rcx), %xmm0
2263 ; SSE-NEXT: movaps %xmm15, %xmm1
2264 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
2265 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2266 ; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1]
2267 ; SSE-NEXT: movaps 208(%rdi), %xmm10
2268 ; SSE-NEXT: movaps 208(%rsi), %xmm0
2269 ; SSE-NEXT: movaps %xmm10, %xmm13
2270 ; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm0[0]
2271 ; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1]
2272 ; SSE-NEXT: movaps 208(%rdx), %xmm7
2273 ; SSE-NEXT: movaps 208(%rcx), %xmm0
2274 ; SSE-NEXT: movaps %xmm7, %xmm11
2275 ; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0]
2276 ; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1]
2277 ; SSE-NEXT: movaps 224(%rdi), %xmm8
2278 ; SSE-NEXT: movaps 224(%rsi), %xmm0
2279 ; SSE-NEXT: movaps %xmm8, %xmm9
2280 ; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm0[0]
2281 ; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1]
2282 ; SSE-NEXT: movaps 224(%rdx), %xmm4
2283 ; SSE-NEXT: movaps 224(%rcx), %xmm0
2284 ; SSE-NEXT: movaps %xmm4, %xmm6
2285 ; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0]
2286 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1]
2287 ; SSE-NEXT: movaps 240(%rdi), %xmm2
2288 ; SSE-NEXT: movaps 240(%rsi), %xmm3
2289 ; SSE-NEXT: movaps %xmm2, %xmm5
2290 ; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm3[0]
2291 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1]
2292 ; SSE-NEXT: movaps 240(%rdx), %xmm3
2293 ; SSE-NEXT: movaps 240(%rcx), %xmm1
2294 ; SSE-NEXT: movaps %xmm3, %xmm0
2295 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2296 ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1]
2297 ; SSE-NEXT: movaps %xmm3, 1008(%r8)
2298 ; SSE-NEXT: movaps %xmm2, 992(%r8)
2299 ; SSE-NEXT: movaps %xmm0, 976(%r8)
2300 ; SSE-NEXT: movaps %xmm5, 960(%r8)
2301 ; SSE-NEXT: movaps %xmm4, 944(%r8)
2302 ; SSE-NEXT: movaps %xmm8, 928(%r8)
2303 ; SSE-NEXT: movaps %xmm6, 912(%r8)
2304 ; SSE-NEXT: movaps %xmm9, 896(%r8)
2305 ; SSE-NEXT: movaps %xmm7, 880(%r8)
2306 ; SSE-NEXT: movaps %xmm10, 864(%r8)
2307 ; SSE-NEXT: movaps %xmm11, 848(%r8)
2308 ; SSE-NEXT: movaps %xmm13, 832(%r8)
2309 ; SSE-NEXT: movaps %xmm15, 816(%r8)
2310 ; SSE-NEXT: movaps %xmm12, 800(%r8)
2311 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2312 ; SSE-NEXT: movaps %xmm0, 784(%r8)
2313 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2314 ; SSE-NEXT: movaps %xmm0, 768(%r8)
2315 ; SSE-NEXT: movaps %xmm14, 752(%r8)
2316 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2317 ; SSE-NEXT: movaps %xmm0, 736(%r8)
2318 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2319 ; SSE-NEXT: movaps %xmm0, 720(%r8)
2320 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2321 ; SSE-NEXT: movaps %xmm0, 704(%r8)
2322 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2323 ; SSE-NEXT: movaps %xmm0, 688(%r8)
2324 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2325 ; SSE-NEXT: movaps %xmm0, 672(%r8)
2326 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2327 ; SSE-NEXT: movaps %xmm0, 656(%r8)
2328 ; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
2329 ; SSE-NEXT: movaps %xmm0, 640(%r8)
2330 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2331 ; SSE-NEXT: movaps %xmm0, 624(%r8)
2332 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2333 ; SSE-NEXT: movaps %xmm0, 608(%r8)
2334 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2335 ; SSE-NEXT: movaps %xmm0, 592(%r8)
2336 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2337 ; SSE-NEXT: movaps %xmm0, 576(%r8)
2338 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2339 ; SSE-NEXT: movaps %xmm0, 560(%r8)
2340 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2341 ; SSE-NEXT: movaps %xmm0, 544(%r8)
2342 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2343 ; SSE-NEXT: movaps %xmm0, 528(%r8)
2344 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2345 ; SSE-NEXT: movaps %xmm0, 512(%r8)
2346 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2347 ; SSE-NEXT: movaps %xmm0, 496(%r8)
2348 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2349 ; SSE-NEXT: movaps %xmm0, 480(%r8)
2350 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2351 ; SSE-NEXT: movaps %xmm0, 464(%r8)
2352 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2353 ; SSE-NEXT: movaps %xmm0, 448(%r8)
2354 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2355 ; SSE-NEXT: movaps %xmm0, 432(%r8)
2356 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2357 ; SSE-NEXT: movaps %xmm0, 416(%r8)
2358 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2359 ; SSE-NEXT: movaps %xmm0, 400(%r8)
2360 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2361 ; SSE-NEXT: movaps %xmm0, 384(%r8)
2362 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2363 ; SSE-NEXT: movaps %xmm0, 368(%r8)
2364 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2365 ; SSE-NEXT: movaps %xmm0, 352(%r8)
2366 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2367 ; SSE-NEXT: movaps %xmm0, 336(%r8)
2368 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2369 ; SSE-NEXT: movaps %xmm0, 320(%r8)
2370 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2371 ; SSE-NEXT: movaps %xmm0, 304(%r8)
2372 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2373 ; SSE-NEXT: movaps %xmm0, 288(%r8)
2374 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2375 ; SSE-NEXT: movaps %xmm0, 272(%r8)
2376 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2377 ; SSE-NEXT: movaps %xmm0, 256(%r8)
2378 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2379 ; SSE-NEXT: movaps %xmm0, 240(%r8)
2380 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2381 ; SSE-NEXT: movaps %xmm0, 224(%r8)
2382 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2383 ; SSE-NEXT: movaps %xmm0, 208(%r8)
2384 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2385 ; SSE-NEXT: movaps %xmm0, 192(%r8)
2386 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2387 ; SSE-NEXT: movaps %xmm0, 176(%r8)
2388 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2389 ; SSE-NEXT: movaps %xmm0, 160(%r8)
2390 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2391 ; SSE-NEXT: movaps %xmm0, 144(%r8)
2392 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2393 ; SSE-NEXT: movaps %xmm0, 128(%r8)
2394 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2395 ; SSE-NEXT: movaps %xmm0, 112(%r8)
2396 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2397 ; SSE-NEXT: movaps %xmm0, 96(%r8)
2398 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2399 ; SSE-NEXT: movaps %xmm0, 80(%r8)
2400 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2401 ; SSE-NEXT: movaps %xmm0, 64(%r8)
2402 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2403 ; SSE-NEXT: movaps %xmm0, 48(%r8)
2404 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2405 ; SSE-NEXT: movaps %xmm0, 32(%r8)
2406 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2407 ; SSE-NEXT: movaps %xmm0, 16(%r8)
2408 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2409 ; SSE-NEXT: movaps %xmm0, (%r8)
2410 ; SSE-NEXT: addq $664, %rsp # imm = 0x298
2413 ; AVX-LABEL: store_i64_stride4_vf32:
2415 ; AVX-NEXT: subq $664, %rsp # imm = 0x298
2416 ; AVX-NEXT: vmovaps 160(%rdx), %ymm0
2417 ; AVX-NEXT: vmovaps 128(%rdx), %ymm2
2418 ; AVX-NEXT: vmovaps 96(%rdx), %ymm4
2419 ; AVX-NEXT: vmovaps 64(%rdx), %ymm6
2420 ; AVX-NEXT: vmovaps 32(%rdx), %ymm7
2421 ; AVX-NEXT: vmovaps (%rdx), %ymm8
2422 ; AVX-NEXT: vmovaps 160(%rcx), %ymm1
2423 ; AVX-NEXT: vmovaps 128(%rcx), %ymm3
2424 ; AVX-NEXT: vmovaps 96(%rcx), %ymm5
2425 ; AVX-NEXT: vmovaps 64(%rcx), %ymm9
2426 ; AVX-NEXT: vmovaps 32(%rcx), %ymm10
2427 ; AVX-NEXT: vmovaps (%rcx), %ymm11
2428 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm8[0],ymm11[0],ymm8[2],ymm11[2]
2429 ; AVX-NEXT: vmovaps 16(%rsi), %xmm13
2430 ; AVX-NEXT: vmovaps 16(%rdi), %xmm14
2431 ; AVX-NEXT: vmovlhps {{.*#+}} xmm15 = xmm14[0],xmm13[0]
2432 ; AVX-NEXT: vblendps {{.*#+}} ymm12 = ymm15[0,1,2,3],ymm12[4,5,6,7]
2433 ; AVX-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2434 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm8[1],ymm11[1],ymm8[3],ymm11[3]
2435 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm14[1],xmm13[1]
2436 ; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1,2,3],ymm8[4,5,6,7]
2437 ; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2438 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm7[0],ymm10[0],ymm7[2],ymm10[2]
2439 ; AVX-NEXT: vmovaps 48(%rsi), %xmm11
2440 ; AVX-NEXT: vmovaps 48(%rdi), %xmm12
2441 ; AVX-NEXT: vmovlhps {{.*#+}} xmm13 = xmm12[0],xmm11[0]
2442 ; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
2443 ; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2444 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm10[1],ymm7[3],ymm10[3]
2445 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm12[1],xmm11[1]
2446 ; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
2447 ; AVX-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2448 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm6[0],ymm9[0],ymm6[2],ymm9[2]
2449 ; AVX-NEXT: vmovaps 80(%rsi), %xmm8
2450 ; AVX-NEXT: vmovaps 80(%rdi), %xmm10
2451 ; AVX-NEXT: vmovlhps {{.*#+}} xmm11 = xmm10[0],xmm8[0]
2452 ; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1,2,3],ymm7[4,5,6,7]
2453 ; AVX-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2454 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm9[1],ymm6[3],ymm9[3]
2455 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm10[1],xmm8[1]
2456 ; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
2457 ; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2458 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
2459 ; AVX-NEXT: vmovaps 112(%rsi), %xmm7
2460 ; AVX-NEXT: vmovaps 112(%rdi), %xmm8
2461 ; AVX-NEXT: vmovlhps {{.*#+}} xmm9 = xmm8[0],xmm7[0]
2462 ; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7]
2463 ; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2464 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
2465 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm8[1],xmm7[1]
2466 ; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
2467 ; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2468 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
2469 ; AVX-NEXT: vmovaps 144(%rsi), %xmm5
2470 ; AVX-NEXT: vmovaps 144(%rdi), %xmm6
2471 ; AVX-NEXT: vmovlhps {{.*#+}} xmm7 = xmm6[0],xmm5[0]
2472 ; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7]
2473 ; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2474 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
2475 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm6[1],xmm5[1]
2476 ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
2477 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2478 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
2479 ; AVX-NEXT: vmovaps 176(%rsi), %xmm3
2480 ; AVX-NEXT: vmovaps 176(%rdi), %xmm4
2481 ; AVX-NEXT: vmovlhps {{.*#+}} xmm5 = xmm4[0],xmm3[0]
2482 ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
2483 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2484 ; AVX-NEXT: vmovaps 192(%rdx), %ymm2
2485 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
2486 ; AVX-NEXT: vmovaps 192(%rcx), %ymm1
2487 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm4[1],xmm3[1]
2488 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
2489 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2490 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
2491 ; AVX-NEXT: vmovaps 208(%rsi), %xmm3
2492 ; AVX-NEXT: vmovaps 208(%rdi), %xmm4
2493 ; AVX-NEXT: vmovlhps {{.*#+}} xmm5 = xmm4[0],xmm3[0]
2494 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
2495 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2496 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm1[1],ymm2[3],ymm1[3]
2497 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm3[1]
2498 ; AVX-NEXT: vmovaps 224(%rdx), %ymm2
2499 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
2500 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2501 ; AVX-NEXT: vmovaps 224(%rcx), %ymm0
2502 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm0[0],ymm2[2],ymm0[2]
2503 ; AVX-NEXT: vmovaps 240(%rsi), %xmm3
2504 ; AVX-NEXT: vmovaps 240(%rdi), %xmm4
2505 ; AVX-NEXT: vmovlhps {{.*#+}} xmm5 = xmm4[0],xmm3[0]
2506 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7]
2507 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2508 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3]
2509 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm3[1]
2510 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
2511 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2512 ; AVX-NEXT: vmovaps 128(%rsi), %xmm0
2513 ; AVX-NEXT: vmovaps 128(%rdi), %xmm1
2514 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
2515 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2516 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
2517 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2518 ; AVX-NEXT: vmovaps 128(%rcx), %xmm0
2519 ; AVX-NEXT: vmovaps 128(%rdx), %xmm1
2520 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
2521 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2522 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
2523 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2524 ; AVX-NEXT: vmovaps 64(%rsi), %xmm0
2525 ; AVX-NEXT: vmovaps 64(%rdi), %xmm1
2526 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
2527 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2528 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
2529 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2530 ; AVX-NEXT: vmovaps 64(%rcx), %xmm0
2531 ; AVX-NEXT: vmovaps 64(%rdx), %xmm1
2532 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
2533 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2534 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
2535 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2536 ; AVX-NEXT: vmovaps 32(%rsi), %xmm0
2537 ; AVX-NEXT: vmovaps 32(%rdi), %xmm1
2538 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
2539 ; AVX-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill
2540 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
2541 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2542 ; AVX-NEXT: vmovaps 32(%rcx), %xmm0
2543 ; AVX-NEXT: vmovaps 32(%rdx), %xmm1
2544 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
2545 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2546 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
2547 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2548 ; AVX-NEXT: vmovaps 96(%rsi), %xmm0
2549 ; AVX-NEXT: vmovaps 96(%rdi), %xmm1
2550 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
2551 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2552 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
2553 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2554 ; AVX-NEXT: vmovaps 96(%rcx), %xmm0
2555 ; AVX-NEXT: vmovaps 96(%rdx), %xmm1
2556 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
2557 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2558 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
2559 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2560 ; AVX-NEXT: vmovaps 160(%rsi), %xmm0
2561 ; AVX-NEXT: vmovaps 160(%rdi), %xmm1
2562 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
2563 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2564 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm15 = xmm1[1],xmm0[1]
2565 ; AVX-NEXT: vmovaps 160(%rcx), %xmm0
2566 ; AVX-NEXT: vmovaps 160(%rdx), %xmm1
2567 ; AVX-NEXT: vmovlhps {{.*#+}} xmm14 = xmm1[0],xmm0[0]
2568 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm1[1],xmm0[1]
2569 ; AVX-NEXT: vmovaps 224(%rsi), %xmm1
2570 ; AVX-NEXT: vmovaps 224(%rdi), %xmm0
2571 ; AVX-NEXT: vmovlhps {{.*#+}} xmm12 = xmm0[0],xmm1[0]
2572 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm0[1],xmm1[1]
2573 ; AVX-NEXT: vmovaps 224(%rcx), %xmm1
2574 ; AVX-NEXT: vmovaps 224(%rdx), %xmm0
2575 ; AVX-NEXT: vmovlhps {{.*#+}} xmm10 = xmm0[0],xmm1[0]
2576 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm0[1],xmm1[1]
2577 ; AVX-NEXT: vmovaps 192(%rsi), %xmm1
2578 ; AVX-NEXT: vmovaps 192(%rdi), %xmm0
2579 ; AVX-NEXT: vmovlhps {{.*#+}} xmm8 = xmm0[0],xmm1[0]
2580 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm0[1],xmm1[1]
2581 ; AVX-NEXT: vmovaps 192(%rcx), %xmm1
2582 ; AVX-NEXT: vmovaps 192(%rdx), %xmm0
2583 ; AVX-NEXT: vmovlhps {{.*#+}} xmm6 = xmm0[0],xmm1[0]
2584 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm0[1],xmm1[1]
2585 ; AVX-NEXT: vmovaps (%rsi), %xmm1
2586 ; AVX-NEXT: vmovaps (%rdi), %xmm0
2587 ; AVX-NEXT: vmovlhps {{.*#+}} xmm4 = xmm0[0],xmm1[0]
2588 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm0[1],xmm1[1]
2589 ; AVX-NEXT: vmovaps (%rcx), %xmm1
2590 ; AVX-NEXT: vmovaps (%rdx), %xmm0
2591 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0]
2592 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
2593 ; AVX-NEXT: vmovaps %xmm0, 48(%r8)
2594 ; AVX-NEXT: vmovaps %xmm3, 32(%r8)
2595 ; AVX-NEXT: vmovaps %xmm2, 16(%r8)
2596 ; AVX-NEXT: vmovaps %xmm4, (%r8)
2597 ; AVX-NEXT: vmovaps %xmm5, 816(%r8)
2598 ; AVX-NEXT: vmovaps %xmm7, 800(%r8)
2599 ; AVX-NEXT: vmovaps %xmm6, 784(%r8)
2600 ; AVX-NEXT: vmovaps %xmm8, 768(%r8)
2601 ; AVX-NEXT: vmovaps %xmm9, 944(%r8)
2602 ; AVX-NEXT: vmovaps %xmm11, 928(%r8)
2603 ; AVX-NEXT: vmovaps %xmm10, 912(%r8)
2604 ; AVX-NEXT: vmovaps %xmm12, 896(%r8)
2605 ; AVX-NEXT: vmovaps %xmm13, 688(%r8)
2606 ; AVX-NEXT: vmovaps %xmm15, 672(%r8)
2607 ; AVX-NEXT: vmovaps %xmm14, 656(%r8)
2608 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2609 ; AVX-NEXT: vmovaps %xmm0, 640(%r8)
2610 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2611 ; AVX-NEXT: vmovaps %xmm0, 432(%r8)
2612 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2613 ; AVX-NEXT: vmovaps %xmm0, 416(%r8)
2614 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2615 ; AVX-NEXT: vmovaps %xmm0, 400(%r8)
2616 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2617 ; AVX-NEXT: vmovaps %xmm0, 384(%r8)
2618 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2619 ; AVX-NEXT: vmovaps %xmm0, 176(%r8)
2620 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2621 ; AVX-NEXT: vmovaps %xmm0, 160(%r8)
2622 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2623 ; AVX-NEXT: vmovaps %xmm0, 144(%r8)
2624 ; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
2625 ; AVX-NEXT: vmovaps %xmm0, 128(%r8)
2626 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2627 ; AVX-NEXT: vmovaps %xmm0, 304(%r8)
2628 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2629 ; AVX-NEXT: vmovaps %xmm0, 288(%r8)
2630 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2631 ; AVX-NEXT: vmovaps %xmm0, 272(%r8)
2632 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2633 ; AVX-NEXT: vmovaps %xmm0, 256(%r8)
2634 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2635 ; AVX-NEXT: vmovaps %xmm0, 560(%r8)
2636 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2637 ; AVX-NEXT: vmovaps %xmm0, 544(%r8)
2638 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2639 ; AVX-NEXT: vmovaps %xmm0, 528(%r8)
2640 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2641 ; AVX-NEXT: vmovaps %xmm0, 512(%r8)
2642 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2643 ; AVX-NEXT: vmovaps %ymm0, 992(%r8)
2644 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2645 ; AVX-NEXT: vmovaps %ymm0, 960(%r8)
2646 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2647 ; AVX-NEXT: vmovaps %ymm0, 864(%r8)
2648 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2649 ; AVX-NEXT: vmovaps %ymm0, 832(%r8)
2650 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2651 ; AVX-NEXT: vmovaps %ymm0, 736(%r8)
2652 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2653 ; AVX-NEXT: vmovaps %ymm0, 704(%r8)
2654 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2655 ; AVX-NEXT: vmovaps %ymm0, 608(%r8)
2656 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2657 ; AVX-NEXT: vmovaps %ymm0, 576(%r8)
2658 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2659 ; AVX-NEXT: vmovaps %ymm0, 480(%r8)
2660 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2661 ; AVX-NEXT: vmovaps %ymm0, 448(%r8)
2662 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2663 ; AVX-NEXT: vmovaps %ymm0, 352(%r8)
2664 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2665 ; AVX-NEXT: vmovaps %ymm0, 320(%r8)
2666 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2667 ; AVX-NEXT: vmovaps %ymm0, 224(%r8)
2668 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2669 ; AVX-NEXT: vmovaps %ymm0, 192(%r8)
2670 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2671 ; AVX-NEXT: vmovaps %ymm0, 96(%r8)
2672 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2673 ; AVX-NEXT: vmovaps %ymm0, 64(%r8)
2674 ; AVX-NEXT: addq $664, %rsp # imm = 0x298
2675 ; AVX-NEXT: vzeroupper
2678 ; AVX2-LABEL: store_i64_stride4_vf32:
2680 ; AVX2-NEXT: subq $520, %rsp # imm = 0x208
2681 ; AVX2-NEXT: vmovaps (%rsi), %xmm0
2682 ; AVX2-NEXT: vmovaps 32(%rsi), %xmm1
2683 ; AVX2-NEXT: vmovaps 64(%rsi), %xmm2
2684 ; AVX2-NEXT: vinsertf128 $1, (%rcx), %ymm0, %ymm0
2685 ; AVX2-NEXT: vmovaps (%rdi), %xmm3
2686 ; AVX2-NEXT: vmovaps 32(%rdi), %xmm4
2687 ; AVX2-NEXT: vmovaps 64(%rdi), %xmm5
2688 ; AVX2-NEXT: vinsertf128 $1, (%rdx), %ymm3, %ymm3
2689 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm3[0],ymm0[0],ymm3[2],ymm0[2]
2690 ; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2691 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm0[1],ymm3[3],ymm0[3]
2692 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2693 ; AVX2-NEXT: vinsertf128 $1, 32(%rcx), %ymm1, %ymm0
2694 ; AVX2-NEXT: vinsertf128 $1, 32(%rdx), %ymm4, %ymm1
2695 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
2696 ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2697 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
2698 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2699 ; AVX2-NEXT: vinsertf128 $1, 64(%rcx), %ymm2, %ymm0
2700 ; AVX2-NEXT: vinsertf128 $1, 64(%rdx), %ymm5, %ymm1
2701 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
2702 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2703 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
2704 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2705 ; AVX2-NEXT: vmovaps 96(%rsi), %xmm0
2706 ; AVX2-NEXT: vinsertf128 $1, 96(%rcx), %ymm0, %ymm0
2707 ; AVX2-NEXT: vmovaps 96(%rdi), %xmm1
2708 ; AVX2-NEXT: vinsertf128 $1, 96(%rdx), %ymm1, %ymm1
2709 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
2710 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2711 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
2712 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2713 ; AVX2-NEXT: vmovaps 128(%rsi), %xmm0
2714 ; AVX2-NEXT: vinsertf128 $1, 128(%rcx), %ymm0, %ymm0
2715 ; AVX2-NEXT: vmovaps 128(%rdi), %xmm1
2716 ; AVX2-NEXT: vinsertf128 $1, 128(%rdx), %ymm1, %ymm1
2717 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
2718 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2719 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
2720 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2721 ; AVX2-NEXT: vmovaps 160(%rsi), %xmm0
2722 ; AVX2-NEXT: vinsertf128 $1, 160(%rcx), %ymm0, %ymm0
2723 ; AVX2-NEXT: vmovaps 160(%rdi), %xmm1
2724 ; AVX2-NEXT: vinsertf128 $1, 160(%rdx), %ymm1, %ymm1
2725 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
2726 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2727 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
2728 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2729 ; AVX2-NEXT: vmovaps 192(%rsi), %xmm0
2730 ; AVX2-NEXT: vinsertf128 $1, 192(%rcx), %ymm0, %ymm0
2731 ; AVX2-NEXT: vmovaps 192(%rdi), %xmm1
2732 ; AVX2-NEXT: vinsertf128 $1, 192(%rdx), %ymm1, %ymm1
2733 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
2734 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2735 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
2736 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2737 ; AVX2-NEXT: vmovaps 224(%rsi), %xmm0
2738 ; AVX2-NEXT: vinsertf128 $1, 224(%rcx), %ymm0, %ymm0
2739 ; AVX2-NEXT: vmovaps 224(%rdi), %xmm1
2740 ; AVX2-NEXT: vinsertf128 $1, 224(%rdx), %ymm1, %ymm1
2741 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
2742 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2743 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
2744 ; AVX2-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
2745 ; AVX2-NEXT: vmovaps (%rdi), %ymm0
2746 ; AVX2-NEXT: vmovaps (%rsi), %ymm1
2747 ; AVX2-NEXT: vmovaps (%rdx), %ymm2
2748 ; AVX2-NEXT: vmovaps (%rcx), %ymm3
2749 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
2750 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
2751 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm5[2,3],ymm4[2,3]
2752 ; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2753 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
2754 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
2755 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
2756 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2757 ; AVX2-NEXT: vmovaps 32(%rdi), %ymm0
2758 ; AVX2-NEXT: vmovaps 32(%rsi), %ymm1
2759 ; AVX2-NEXT: vmovaps 32(%rdx), %ymm2
2760 ; AVX2-NEXT: vmovaps 32(%rcx), %ymm3
2761 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
2762 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
2763 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm5[2,3],ymm4[2,3]
2764 ; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2765 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
2766 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
2767 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
2768 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2769 ; AVX2-NEXT: vmovaps 64(%rdi), %ymm0
2770 ; AVX2-NEXT: vmovaps 64(%rsi), %ymm1
2771 ; AVX2-NEXT: vmovaps 64(%rdx), %ymm2
2772 ; AVX2-NEXT: vmovaps 64(%rcx), %ymm3
2773 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
2774 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
2775 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm5[2,3],ymm4[2,3]
2776 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
2777 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
2778 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm0[2,3],ymm2[2,3]
2779 ; AVX2-NEXT: vmovaps 96(%rdi), %ymm3
2780 ; AVX2-NEXT: vmovaps 96(%rsi), %ymm1
2781 ; AVX2-NEXT: vmovaps 96(%rdx), %ymm2
2782 ; AVX2-NEXT: vmovaps 96(%rcx), %ymm0
2783 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm0[0],ymm2[2],ymm0[2]
2784 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
2785 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm11[2,3],ymm4[2,3]
2786 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3]
2787 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
2788 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm1[2,3],ymm0[2,3]
2789 ; AVX2-NEXT: vmovaps 128(%rdi), %ymm1
2790 ; AVX2-NEXT: vmovaps 128(%rsi), %ymm3
2791 ; AVX2-NEXT: vmovaps 128(%rdx), %ymm11
2792 ; AVX2-NEXT: vmovaps 128(%rcx), %ymm0
2793 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm11[0],ymm0[0],ymm11[2],ymm0[2]
2794 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm1[0],ymm3[0],ymm1[2],ymm3[2]
2795 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm13[2,3],ymm2[2,3]
2796 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm0[1],ymm11[3],ymm0[3]
2797 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3]
2798 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm1[2,3],ymm0[2,3]
2799 ; AVX2-NEXT: vmovaps 160(%rdi), %ymm1
2800 ; AVX2-NEXT: vmovaps 160(%rsi), %ymm3
2801 ; AVX2-NEXT: vmovaps 160(%rdx), %ymm11
2802 ; AVX2-NEXT: vmovaps 160(%rcx), %ymm13
2803 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm13[0],ymm11[2],ymm13[2]
2804 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm1[0],ymm3[0],ymm1[2],ymm3[2]
2805 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm15[2,3],ymm0[2,3]
2806 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm11[1],ymm13[1],ymm11[3],ymm13[3]
2807 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3]
2808 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],ymm11[2,3]
2809 ; AVX2-NEXT: vmovaps 192(%rdi), %ymm3
2810 ; AVX2-NEXT: vmovaps 192(%rsi), %ymm11
2811 ; AVX2-NEXT: vmovaps 192(%rdx), %ymm13
2812 ; AVX2-NEXT: vmovaps 192(%rcx), %ymm15
2813 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm15[0],ymm13[2],ymm15[2]
2814 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm3[0],ymm11[0],ymm3[2],ymm11[2]
2815 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm14[2,3],ymm0[2,3]
2816 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm13[1],ymm15[1],ymm13[3],ymm15[3]
2817 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm11[1],ymm3[3],ymm11[3]
2818 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm13[2,3]
2819 ; AVX2-NEXT: vmovaps 224(%rdi), %ymm11
2820 ; AVX2-NEXT: vmovaps 224(%rsi), %ymm13
2821 ; AVX2-NEXT: vmovaps 224(%rdx), %ymm14
2822 ; AVX2-NEXT: vmovaps 224(%rcx), %ymm15
2823 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm14[0],ymm15[0],ymm14[2],ymm15[2]
2824 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm11[0],ymm13[0],ymm11[2],ymm13[2]
2825 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3]
2826 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm14[1],ymm15[1],ymm14[3],ymm15[3]
2827 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm11[1],ymm13[1],ymm11[3],ymm13[3]
2828 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm11[2,3],ymm12[2,3]
2829 ; AVX2-NEXT: vmovaps %ymm11, 992(%r8)
2830 ; AVX2-NEXT: vmovaps %ymm0, 960(%r8)
2831 ; AVX2-NEXT: vmovaps %ymm3, 864(%r8)
2832 ; AVX2-NEXT: vmovaps %ymm1, 832(%r8)
2833 ; AVX2-NEXT: vmovaps %ymm2, 736(%r8)
2834 ; AVX2-NEXT: vmovaps %ymm4, 704(%r8)
2835 ; AVX2-NEXT: vmovaps %ymm5, 608(%r8)
2836 ; AVX2-NEXT: vmovaps %ymm6, 576(%r8)
2837 ; AVX2-NEXT: vmovaps %ymm7, 480(%r8)
2838 ; AVX2-NEXT: vmovaps %ymm8, 448(%r8)
2839 ; AVX2-NEXT: vmovaps %ymm9, 352(%r8)
2840 ; AVX2-NEXT: vmovaps %ymm10, 320(%r8)
2841 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2842 ; AVX2-NEXT: vmovaps %ymm0, 224(%r8)
2843 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2844 ; AVX2-NEXT: vmovaps %ymm0, 192(%r8)
2845 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2846 ; AVX2-NEXT: vmovaps %ymm0, 96(%r8)
2847 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2848 ; AVX2-NEXT: vmovaps %ymm0, 64(%r8)
2849 ; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
2850 ; AVX2-NEXT: vmovaps %ymm0, 928(%r8)
2851 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2852 ; AVX2-NEXT: vmovaps %ymm0, 896(%r8)
2853 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2854 ; AVX2-NEXT: vmovaps %ymm0, 800(%r8)
2855 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2856 ; AVX2-NEXT: vmovaps %ymm0, 768(%r8)
2857 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2858 ; AVX2-NEXT: vmovaps %ymm0, 672(%r8)
2859 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2860 ; AVX2-NEXT: vmovaps %ymm0, 640(%r8)
2861 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2862 ; AVX2-NEXT: vmovaps %ymm0, 544(%r8)
2863 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2864 ; AVX2-NEXT: vmovaps %ymm0, 512(%r8)
2865 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2866 ; AVX2-NEXT: vmovaps %ymm0, 416(%r8)
2867 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2868 ; AVX2-NEXT: vmovaps %ymm0, 384(%r8)
2869 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2870 ; AVX2-NEXT: vmovaps %ymm0, 288(%r8)
2871 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2872 ; AVX2-NEXT: vmovaps %ymm0, 256(%r8)
2873 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2874 ; AVX2-NEXT: vmovaps %ymm0, 160(%r8)
2875 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2876 ; AVX2-NEXT: vmovaps %ymm0, 128(%r8)
2877 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2878 ; AVX2-NEXT: vmovaps %ymm0, 32(%r8)
2879 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2880 ; AVX2-NEXT: vmovaps %ymm0, (%r8)
2881 ; AVX2-NEXT: addq $520, %rsp # imm = 0x208
2882 ; AVX2-NEXT: vzeroupper
2885 ; AVX2-FP-LABEL: store_i64_stride4_vf32:
2887 ; AVX2-FP-NEXT: subq $520, %rsp # imm = 0x208
2888 ; AVX2-FP-NEXT: vmovaps (%rsi), %xmm0
2889 ; AVX2-FP-NEXT: vmovaps 32(%rsi), %xmm1
2890 ; AVX2-FP-NEXT: vmovaps 64(%rsi), %xmm2
2891 ; AVX2-FP-NEXT: vinsertf128 $1, (%rcx), %ymm0, %ymm0
2892 ; AVX2-FP-NEXT: vmovaps (%rdi), %xmm3
2893 ; AVX2-FP-NEXT: vmovaps 32(%rdi), %xmm4
2894 ; AVX2-FP-NEXT: vmovaps 64(%rdi), %xmm5
2895 ; AVX2-FP-NEXT: vinsertf128 $1, (%rdx), %ymm3, %ymm3
2896 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm3[0],ymm0[0],ymm3[2],ymm0[2]
2897 ; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2898 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm0[1],ymm3[3],ymm0[3]
2899 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2900 ; AVX2-FP-NEXT: vinsertf128 $1, 32(%rcx), %ymm1, %ymm0
2901 ; AVX2-FP-NEXT: vinsertf128 $1, 32(%rdx), %ymm4, %ymm1
2902 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
2903 ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2904 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
2905 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2906 ; AVX2-FP-NEXT: vinsertf128 $1, 64(%rcx), %ymm2, %ymm0
2907 ; AVX2-FP-NEXT: vinsertf128 $1, 64(%rdx), %ymm5, %ymm1
2908 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
2909 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2910 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
2911 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2912 ; AVX2-FP-NEXT: vmovaps 96(%rsi), %xmm0
2913 ; AVX2-FP-NEXT: vinsertf128 $1, 96(%rcx), %ymm0, %ymm0
2914 ; AVX2-FP-NEXT: vmovaps 96(%rdi), %xmm1
2915 ; AVX2-FP-NEXT: vinsertf128 $1, 96(%rdx), %ymm1, %ymm1
2916 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
2917 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2918 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
2919 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2920 ; AVX2-FP-NEXT: vmovaps 128(%rsi), %xmm0
2921 ; AVX2-FP-NEXT: vinsertf128 $1, 128(%rcx), %ymm0, %ymm0
2922 ; AVX2-FP-NEXT: vmovaps 128(%rdi), %xmm1
2923 ; AVX2-FP-NEXT: vinsertf128 $1, 128(%rdx), %ymm1, %ymm1
2924 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
2925 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2926 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
2927 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2928 ; AVX2-FP-NEXT: vmovaps 160(%rsi), %xmm0
2929 ; AVX2-FP-NEXT: vinsertf128 $1, 160(%rcx), %ymm0, %ymm0
2930 ; AVX2-FP-NEXT: vmovaps 160(%rdi), %xmm1
2931 ; AVX2-FP-NEXT: vinsertf128 $1, 160(%rdx), %ymm1, %ymm1
2932 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
2933 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2934 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
2935 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2936 ; AVX2-FP-NEXT: vmovaps 192(%rsi), %xmm0
2937 ; AVX2-FP-NEXT: vinsertf128 $1, 192(%rcx), %ymm0, %ymm0
2938 ; AVX2-FP-NEXT: vmovaps 192(%rdi), %xmm1
2939 ; AVX2-FP-NEXT: vinsertf128 $1, 192(%rdx), %ymm1, %ymm1
2940 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
2941 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2942 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
2943 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2944 ; AVX2-FP-NEXT: vmovaps 224(%rsi), %xmm0
2945 ; AVX2-FP-NEXT: vinsertf128 $1, 224(%rcx), %ymm0, %ymm0
2946 ; AVX2-FP-NEXT: vmovaps 224(%rdi), %xmm1
2947 ; AVX2-FP-NEXT: vinsertf128 $1, 224(%rdx), %ymm1, %ymm1
2948 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
2949 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2950 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
2951 ; AVX2-FP-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
2952 ; AVX2-FP-NEXT: vmovaps (%rdi), %ymm0
2953 ; AVX2-FP-NEXT: vmovaps (%rsi), %ymm1
2954 ; AVX2-FP-NEXT: vmovaps (%rdx), %ymm2
2955 ; AVX2-FP-NEXT: vmovaps (%rcx), %ymm3
2956 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
2957 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
2958 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm5[2,3],ymm4[2,3]
2959 ; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2960 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
2961 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
2962 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
2963 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2964 ; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm0
2965 ; AVX2-FP-NEXT: vmovaps 32(%rsi), %ymm1
2966 ; AVX2-FP-NEXT: vmovaps 32(%rdx), %ymm2
2967 ; AVX2-FP-NEXT: vmovaps 32(%rcx), %ymm3
2968 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
2969 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
2970 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm5[2,3],ymm4[2,3]
2971 ; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2972 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
2973 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
2974 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
2975 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2976 ; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm0
2977 ; AVX2-FP-NEXT: vmovaps 64(%rsi), %ymm1
2978 ; AVX2-FP-NEXT: vmovaps 64(%rdx), %ymm2
2979 ; AVX2-FP-NEXT: vmovaps 64(%rcx), %ymm3
2980 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
2981 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
2982 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm5[2,3],ymm4[2,3]
2983 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
2984 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
2985 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm0[2,3],ymm2[2,3]
2986 ; AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm3
2987 ; AVX2-FP-NEXT: vmovaps 96(%rsi), %ymm1
2988 ; AVX2-FP-NEXT: vmovaps 96(%rdx), %ymm2
2989 ; AVX2-FP-NEXT: vmovaps 96(%rcx), %ymm0
2990 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm0[0],ymm2[2],ymm0[2]
2991 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
2992 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm11[2,3],ymm4[2,3]
2993 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3]
2994 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
2995 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm1[2,3],ymm0[2,3]
2996 ; AVX2-FP-NEXT: vmovaps 128(%rdi), %ymm1
2997 ; AVX2-FP-NEXT: vmovaps 128(%rsi), %ymm3
2998 ; AVX2-FP-NEXT: vmovaps 128(%rdx), %ymm11
2999 ; AVX2-FP-NEXT: vmovaps 128(%rcx), %ymm0
3000 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm11[0],ymm0[0],ymm11[2],ymm0[2]
3001 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm1[0],ymm3[0],ymm1[2],ymm3[2]
3002 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm13[2,3],ymm2[2,3]
3003 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm0[1],ymm11[3],ymm0[3]
3004 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3]
3005 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm1[2,3],ymm0[2,3]
3006 ; AVX2-FP-NEXT: vmovaps 160(%rdi), %ymm1
3007 ; AVX2-FP-NEXT: vmovaps 160(%rsi), %ymm3
3008 ; AVX2-FP-NEXT: vmovaps 160(%rdx), %ymm11
3009 ; AVX2-FP-NEXT: vmovaps 160(%rcx), %ymm13
3010 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm13[0],ymm11[2],ymm13[2]
3011 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm1[0],ymm3[0],ymm1[2],ymm3[2]
3012 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm15[2,3],ymm0[2,3]
3013 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm11[1],ymm13[1],ymm11[3],ymm13[3]
3014 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3]
3015 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],ymm11[2,3]
3016 ; AVX2-FP-NEXT: vmovaps 192(%rdi), %ymm3
3017 ; AVX2-FP-NEXT: vmovaps 192(%rsi), %ymm11
3018 ; AVX2-FP-NEXT: vmovaps 192(%rdx), %ymm13
3019 ; AVX2-FP-NEXT: vmovaps 192(%rcx), %ymm15
3020 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm15[0],ymm13[2],ymm15[2]
3021 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm3[0],ymm11[0],ymm3[2],ymm11[2]
3022 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm14[2,3],ymm0[2,3]
3023 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm13[1],ymm15[1],ymm13[3],ymm15[3]
3024 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm11[1],ymm3[3],ymm11[3]
3025 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm13[2,3]
3026 ; AVX2-FP-NEXT: vmovaps 224(%rdi), %ymm11
3027 ; AVX2-FP-NEXT: vmovaps 224(%rsi), %ymm13
3028 ; AVX2-FP-NEXT: vmovaps 224(%rdx), %ymm14
3029 ; AVX2-FP-NEXT: vmovaps 224(%rcx), %ymm15
3030 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm14[0],ymm15[0],ymm14[2],ymm15[2]
3031 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm11[0],ymm13[0],ymm11[2],ymm13[2]
3032 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3]
3033 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm14[1],ymm15[1],ymm14[3],ymm15[3]
3034 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm11[1],ymm13[1],ymm11[3],ymm13[3]
3035 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm11[2,3],ymm12[2,3]
3036 ; AVX2-FP-NEXT: vmovaps %ymm11, 992(%r8)
3037 ; AVX2-FP-NEXT: vmovaps %ymm0, 960(%r8)
3038 ; AVX2-FP-NEXT: vmovaps %ymm3, 864(%r8)
3039 ; AVX2-FP-NEXT: vmovaps %ymm1, 832(%r8)
3040 ; AVX2-FP-NEXT: vmovaps %ymm2, 736(%r8)
3041 ; AVX2-FP-NEXT: vmovaps %ymm4, 704(%r8)
3042 ; AVX2-FP-NEXT: vmovaps %ymm5, 608(%r8)
3043 ; AVX2-FP-NEXT: vmovaps %ymm6, 576(%r8)
3044 ; AVX2-FP-NEXT: vmovaps %ymm7, 480(%r8)
3045 ; AVX2-FP-NEXT: vmovaps %ymm8, 448(%r8)
3046 ; AVX2-FP-NEXT: vmovaps %ymm9, 352(%r8)
3047 ; AVX2-FP-NEXT: vmovaps %ymm10, 320(%r8)
3048 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3049 ; AVX2-FP-NEXT: vmovaps %ymm0, 224(%r8)
3050 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3051 ; AVX2-FP-NEXT: vmovaps %ymm0, 192(%r8)
3052 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3053 ; AVX2-FP-NEXT: vmovaps %ymm0, 96(%r8)
3054 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3055 ; AVX2-FP-NEXT: vmovaps %ymm0, 64(%r8)
3056 ; AVX2-FP-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
3057 ; AVX2-FP-NEXT: vmovaps %ymm0, 928(%r8)
3058 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3059 ; AVX2-FP-NEXT: vmovaps %ymm0, 896(%r8)
3060 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3061 ; AVX2-FP-NEXT: vmovaps %ymm0, 800(%r8)
3062 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3063 ; AVX2-FP-NEXT: vmovaps %ymm0, 768(%r8)
3064 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3065 ; AVX2-FP-NEXT: vmovaps %ymm0, 672(%r8)
3066 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3067 ; AVX2-FP-NEXT: vmovaps %ymm0, 640(%r8)
3068 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3069 ; AVX2-FP-NEXT: vmovaps %ymm0, 544(%r8)
3070 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3071 ; AVX2-FP-NEXT: vmovaps %ymm0, 512(%r8)
3072 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3073 ; AVX2-FP-NEXT: vmovaps %ymm0, 416(%r8)
3074 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3075 ; AVX2-FP-NEXT: vmovaps %ymm0, 384(%r8)
3076 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3077 ; AVX2-FP-NEXT: vmovaps %ymm0, 288(%r8)
3078 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3079 ; AVX2-FP-NEXT: vmovaps %ymm0, 256(%r8)
3080 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3081 ; AVX2-FP-NEXT: vmovaps %ymm0, 160(%r8)
3082 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3083 ; AVX2-FP-NEXT: vmovaps %ymm0, 128(%r8)
3084 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3085 ; AVX2-FP-NEXT: vmovaps %ymm0, 32(%r8)
3086 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3087 ; AVX2-FP-NEXT: vmovaps %ymm0, (%r8)
3088 ; AVX2-FP-NEXT: addq $520, %rsp # imm = 0x208
3089 ; AVX2-FP-NEXT: vzeroupper
3090 ; AVX2-FP-NEXT: retq
3092 ; AVX2-FCP-LABEL: store_i64_stride4_vf32:
3093 ; AVX2-FCP: # %bb.0:
3094 ; AVX2-FCP-NEXT: subq $520, %rsp # imm = 0x208
3095 ; AVX2-FCP-NEXT: vmovaps (%rsi), %xmm0
3096 ; AVX2-FCP-NEXT: vmovaps 32(%rsi), %xmm1
3097 ; AVX2-FCP-NEXT: vmovaps 64(%rsi), %xmm2
3098 ; AVX2-FCP-NEXT: vinsertf128 $1, (%rcx), %ymm0, %ymm0
3099 ; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm3
3100 ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %xmm4
3101 ; AVX2-FCP-NEXT: vmovaps 64(%rdi), %xmm5
3102 ; AVX2-FCP-NEXT: vinsertf128 $1, (%rdx), %ymm3, %ymm3
3103 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm3[0],ymm0[0],ymm3[2],ymm0[2]
3104 ; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3105 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm0[1],ymm3[3],ymm0[3]
3106 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3107 ; AVX2-FCP-NEXT: vinsertf128 $1, 32(%rcx), %ymm1, %ymm0
3108 ; AVX2-FCP-NEXT: vinsertf128 $1, 32(%rdx), %ymm4, %ymm1
3109 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
3110 ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3111 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
3112 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3113 ; AVX2-FCP-NEXT: vinsertf128 $1, 64(%rcx), %ymm2, %ymm0
3114 ; AVX2-FCP-NEXT: vinsertf128 $1, 64(%rdx), %ymm5, %ymm1
3115 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
3116 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3117 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
3118 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3119 ; AVX2-FCP-NEXT: vmovaps 96(%rsi), %xmm0
3120 ; AVX2-FCP-NEXT: vinsertf128 $1, 96(%rcx), %ymm0, %ymm0
3121 ; AVX2-FCP-NEXT: vmovaps 96(%rdi), %xmm1
3122 ; AVX2-FCP-NEXT: vinsertf128 $1, 96(%rdx), %ymm1, %ymm1
3123 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
3124 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3125 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
3126 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3127 ; AVX2-FCP-NEXT: vmovaps 128(%rsi), %xmm0
3128 ; AVX2-FCP-NEXT: vinsertf128 $1, 128(%rcx), %ymm0, %ymm0
3129 ; AVX2-FCP-NEXT: vmovaps 128(%rdi), %xmm1
3130 ; AVX2-FCP-NEXT: vinsertf128 $1, 128(%rdx), %ymm1, %ymm1
3131 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
3132 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3133 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
3134 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3135 ; AVX2-FCP-NEXT: vmovaps 160(%rsi), %xmm0
3136 ; AVX2-FCP-NEXT: vinsertf128 $1, 160(%rcx), %ymm0, %ymm0
3137 ; AVX2-FCP-NEXT: vmovaps 160(%rdi), %xmm1
3138 ; AVX2-FCP-NEXT: vinsertf128 $1, 160(%rdx), %ymm1, %ymm1
3139 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
3140 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3141 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
3142 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3143 ; AVX2-FCP-NEXT: vmovaps 192(%rsi), %xmm0
3144 ; AVX2-FCP-NEXT: vinsertf128 $1, 192(%rcx), %ymm0, %ymm0
3145 ; AVX2-FCP-NEXT: vmovaps 192(%rdi), %xmm1
3146 ; AVX2-FCP-NEXT: vinsertf128 $1, 192(%rdx), %ymm1, %ymm1
3147 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
3148 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3149 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
3150 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3151 ; AVX2-FCP-NEXT: vmovaps 224(%rsi), %xmm0
3152 ; AVX2-FCP-NEXT: vinsertf128 $1, 224(%rcx), %ymm0, %ymm0
3153 ; AVX2-FCP-NEXT: vmovaps 224(%rdi), %xmm1
3154 ; AVX2-FCP-NEXT: vinsertf128 $1, 224(%rdx), %ymm1, %ymm1
3155 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
3156 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3157 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
3158 ; AVX2-FCP-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
3159 ; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm0
3160 ; AVX2-FCP-NEXT: vmovaps (%rsi), %ymm1
3161 ; AVX2-FCP-NEXT: vmovaps (%rdx), %ymm2
3162 ; AVX2-FCP-NEXT: vmovaps (%rcx), %ymm3
3163 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
3164 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
3165 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm5[2,3],ymm4[2,3]
3166 ; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3167 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
3168 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
3169 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
3170 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3171 ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm0
3172 ; AVX2-FCP-NEXT: vmovaps 32(%rsi), %ymm1
3173 ; AVX2-FCP-NEXT: vmovaps 32(%rdx), %ymm2
3174 ; AVX2-FCP-NEXT: vmovaps 32(%rcx), %ymm3
3175 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
3176 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
3177 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm5[2,3],ymm4[2,3]
3178 ; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3179 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
3180 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
3181 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
3182 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3183 ; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm0
3184 ; AVX2-FCP-NEXT: vmovaps 64(%rsi), %ymm1
3185 ; AVX2-FCP-NEXT: vmovaps 64(%rdx), %ymm2
3186 ; AVX2-FCP-NEXT: vmovaps 64(%rcx), %ymm3
3187 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
3188 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
3189 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm5[2,3],ymm4[2,3]
3190 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
3191 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
3192 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm0[2,3],ymm2[2,3]
3193 ; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm3
3194 ; AVX2-FCP-NEXT: vmovaps 96(%rsi), %ymm1
3195 ; AVX2-FCP-NEXT: vmovaps 96(%rdx), %ymm2
3196 ; AVX2-FCP-NEXT: vmovaps 96(%rcx), %ymm0
3197 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm0[0],ymm2[2],ymm0[2]
3198 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
3199 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm11[2,3],ymm4[2,3]
3200 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3]
3201 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
3202 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm1[2,3],ymm0[2,3]
3203 ; AVX2-FCP-NEXT: vmovaps 128(%rdi), %ymm1
3204 ; AVX2-FCP-NEXT: vmovaps 128(%rsi), %ymm3
3205 ; AVX2-FCP-NEXT: vmovaps 128(%rdx), %ymm11
3206 ; AVX2-FCP-NEXT: vmovaps 128(%rcx), %ymm0
3207 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm11[0],ymm0[0],ymm11[2],ymm0[2]
3208 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm1[0],ymm3[0],ymm1[2],ymm3[2]
3209 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm13[2,3],ymm2[2,3]
3210 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm0[1],ymm11[3],ymm0[3]
3211 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3]
3212 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm1[2,3],ymm0[2,3]
3213 ; AVX2-FCP-NEXT: vmovaps 160(%rdi), %ymm1
3214 ; AVX2-FCP-NEXT: vmovaps 160(%rsi), %ymm3
3215 ; AVX2-FCP-NEXT: vmovaps 160(%rdx), %ymm11
3216 ; AVX2-FCP-NEXT: vmovaps 160(%rcx), %ymm13
3217 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm13[0],ymm11[2],ymm13[2]
3218 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm1[0],ymm3[0],ymm1[2],ymm3[2]
3219 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm15[2,3],ymm0[2,3]
3220 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm11[1],ymm13[1],ymm11[3],ymm13[3]
3221 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3]
3222 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],ymm11[2,3]
3223 ; AVX2-FCP-NEXT: vmovaps 192(%rdi), %ymm3
3224 ; AVX2-FCP-NEXT: vmovaps 192(%rsi), %ymm11
3225 ; AVX2-FCP-NEXT: vmovaps 192(%rdx), %ymm13
3226 ; AVX2-FCP-NEXT: vmovaps 192(%rcx), %ymm15
3227 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm15[0],ymm13[2],ymm15[2]
3228 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm3[0],ymm11[0],ymm3[2],ymm11[2]
3229 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm14[2,3],ymm0[2,3]
3230 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm13[1],ymm15[1],ymm13[3],ymm15[3]
3231 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm11[1],ymm3[3],ymm11[3]
3232 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm13[2,3]
3233 ; AVX2-FCP-NEXT: vmovaps 224(%rdi), %ymm11
3234 ; AVX2-FCP-NEXT: vmovaps 224(%rsi), %ymm13
3235 ; AVX2-FCP-NEXT: vmovaps 224(%rdx), %ymm14
3236 ; AVX2-FCP-NEXT: vmovaps 224(%rcx), %ymm15
3237 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm14[0],ymm15[0],ymm14[2],ymm15[2]
3238 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm11[0],ymm13[0],ymm11[2],ymm13[2]
3239 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3]
3240 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm14[1],ymm15[1],ymm14[3],ymm15[3]
3241 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm11[1],ymm13[1],ymm11[3],ymm13[3]
3242 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm11[2,3],ymm12[2,3]
3243 ; AVX2-FCP-NEXT: vmovaps %ymm11, 992(%r8)
3244 ; AVX2-FCP-NEXT: vmovaps %ymm0, 960(%r8)
3245 ; AVX2-FCP-NEXT: vmovaps %ymm3, 864(%r8)
3246 ; AVX2-FCP-NEXT: vmovaps %ymm1, 832(%r8)
3247 ; AVX2-FCP-NEXT: vmovaps %ymm2, 736(%r8)
3248 ; AVX2-FCP-NEXT: vmovaps %ymm4, 704(%r8)
3249 ; AVX2-FCP-NEXT: vmovaps %ymm5, 608(%r8)
3250 ; AVX2-FCP-NEXT: vmovaps %ymm6, 576(%r8)
3251 ; AVX2-FCP-NEXT: vmovaps %ymm7, 480(%r8)
3252 ; AVX2-FCP-NEXT: vmovaps %ymm8, 448(%r8)
3253 ; AVX2-FCP-NEXT: vmovaps %ymm9, 352(%r8)
3254 ; AVX2-FCP-NEXT: vmovaps %ymm10, 320(%r8)
3255 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3256 ; AVX2-FCP-NEXT: vmovaps %ymm0, 224(%r8)
3257 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3258 ; AVX2-FCP-NEXT: vmovaps %ymm0, 192(%r8)
3259 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3260 ; AVX2-FCP-NEXT: vmovaps %ymm0, 96(%r8)
3261 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3262 ; AVX2-FCP-NEXT: vmovaps %ymm0, 64(%r8)
3263 ; AVX2-FCP-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
3264 ; AVX2-FCP-NEXT: vmovaps %ymm0, 928(%r8)
3265 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3266 ; AVX2-FCP-NEXT: vmovaps %ymm0, 896(%r8)
3267 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3268 ; AVX2-FCP-NEXT: vmovaps %ymm0, 800(%r8)
3269 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3270 ; AVX2-FCP-NEXT: vmovaps %ymm0, 768(%r8)
3271 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3272 ; AVX2-FCP-NEXT: vmovaps %ymm0, 672(%r8)
3273 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3274 ; AVX2-FCP-NEXT: vmovaps %ymm0, 640(%r8)
3275 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3276 ; AVX2-FCP-NEXT: vmovaps %ymm0, 544(%r8)
3277 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3278 ; AVX2-FCP-NEXT: vmovaps %ymm0, 512(%r8)
3279 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3280 ; AVX2-FCP-NEXT: vmovaps %ymm0, 416(%r8)
3281 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3282 ; AVX2-FCP-NEXT: vmovaps %ymm0, 384(%r8)
3283 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3284 ; AVX2-FCP-NEXT: vmovaps %ymm0, 288(%r8)
3285 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3286 ; AVX2-FCP-NEXT: vmovaps %ymm0, 256(%r8)
3287 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3288 ; AVX2-FCP-NEXT: vmovaps %ymm0, 160(%r8)
3289 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3290 ; AVX2-FCP-NEXT: vmovaps %ymm0, 128(%r8)
3291 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3292 ; AVX2-FCP-NEXT: vmovaps %ymm0, 32(%r8)
3293 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3294 ; AVX2-FCP-NEXT: vmovaps %ymm0, (%r8)
3295 ; AVX2-FCP-NEXT: addq $520, %rsp # imm = 0x208
3296 ; AVX2-FCP-NEXT: vzeroupper
3297 ; AVX2-FCP-NEXT: retq
3299 ; AVX512-LABEL: store_i64_stride4_vf32:
3301 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
3302 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1
3303 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm2
3304 ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm3
3305 ; AVX512-NEXT: vmovdqa64 (%rsi), %zmm17
3306 ; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm23
3307 ; AVX512-NEXT: vmovdqa64 128(%rsi), %zmm12
3308 ; AVX512-NEXT: vmovdqa64 192(%rsi), %zmm5
3309 ; AVX512-NEXT: vmovdqa64 (%rdx), %zmm22
3310 ; AVX512-NEXT: vmovdqa64 64(%rdx), %zmm25
3311 ; AVX512-NEXT: vmovdqa64 128(%rdx), %zmm13
3312 ; AVX512-NEXT: vmovdqa64 192(%rdx), %zmm6
3313 ; AVX512-NEXT: vmovdqa64 (%rcx), %zmm21
3314 ; AVX512-NEXT: vmovdqa64 64(%rcx), %zmm26
3315 ; AVX512-NEXT: vmovdqa64 128(%rcx), %zmm19
3316 ; AVX512-NEXT: vmovdqa64 192(%rcx), %zmm9
3317 ; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,2,10,0,0,3,11]
3318 ; AVX512-NEXT: vmovdqa64 %zmm22, %zmm8
3319 ; AVX512-NEXT: vpermt2q %zmm21, %zmm14, %zmm8
3320 ; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm7 = [2,10,0,0,3,11,0,0]
3321 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm4
3322 ; AVX512-NEXT: vpermt2q %zmm17, %zmm7, %zmm4
3323 ; AVX512-NEXT: movb $-52, %al
3324 ; AVX512-NEXT: kmovw %eax, %k1
3325 ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1}
3326 ; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,0,0,8,0,0,1,9]
3327 ; AVX512-NEXT: vmovdqa64 %zmm22, %zmm10
3328 ; AVX512-NEXT: vpermt2q %zmm21, %zmm16, %zmm10
3329 ; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,8,0,0,1,9,0,0]
3330 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm8
3331 ; AVX512-NEXT: vpermt2q %zmm17, %zmm11, %zmm8
3332 ; AVX512-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1}
3333 ; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,6,14,0,0,7,15]
3334 ; AVX512-NEXT: vmovdqa64 %zmm22, %zmm20
3335 ; AVX512-NEXT: vpermt2q %zmm21, %zmm18, %zmm20
3336 ; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm15 = [6,14,0,0,7,15,0,0]
3337 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm10
3338 ; AVX512-NEXT: vpermt2q %zmm17, %zmm15, %zmm10
3339 ; AVX512-NEXT: vmovdqa64 %zmm20, %zmm10 {%k1}
3340 ; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,4,12,0,0,5,13]
3341 ; AVX512-NEXT: vpermt2q %zmm21, %zmm20, %zmm22
3342 ; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm21 = [4,12,0,0,5,13,0,0]
3343 ; AVX512-NEXT: vpermt2q %zmm17, %zmm21, %zmm0
3344 ; AVX512-NEXT: vmovdqa64 %zmm22, %zmm0 {%k1}
3345 ; AVX512-NEXT: vmovdqa64 %zmm25, %zmm22
3346 ; AVX512-NEXT: vpermt2q %zmm26, %zmm14, %zmm22
3347 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm17
3348 ; AVX512-NEXT: vpermt2q %zmm23, %zmm7, %zmm17
3349 ; AVX512-NEXT: vmovdqa64 %zmm22, %zmm17 {%k1}
3350 ; AVX512-NEXT: vmovdqa64 %zmm25, %zmm24
3351 ; AVX512-NEXT: vpermt2q %zmm26, %zmm16, %zmm24
3352 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm22
3353 ; AVX512-NEXT: vpermt2q %zmm23, %zmm11, %zmm22
3354 ; AVX512-NEXT: vmovdqa64 %zmm24, %zmm22 {%k1}
3355 ; AVX512-NEXT: vmovdqa64 %zmm25, %zmm27
3356 ; AVX512-NEXT: vpermt2q %zmm26, %zmm18, %zmm27
3357 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm24
3358 ; AVX512-NEXT: vpermt2q %zmm23, %zmm15, %zmm24
3359 ; AVX512-NEXT: vmovdqa64 %zmm27, %zmm24 {%k1}
3360 ; AVX512-NEXT: vpermt2q %zmm26, %zmm20, %zmm25
3361 ; AVX512-NEXT: vpermt2q %zmm23, %zmm21, %zmm1
3362 ; AVX512-NEXT: vmovdqa64 %zmm25, %zmm1 {%k1}
3363 ; AVX512-NEXT: vmovdqa64 %zmm13, %zmm23
3364 ; AVX512-NEXT: vpermt2q %zmm19, %zmm14, %zmm23
3365 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm25
3366 ; AVX512-NEXT: vpermt2q %zmm12, %zmm7, %zmm25
3367 ; AVX512-NEXT: vmovdqa64 %zmm23, %zmm25 {%k1}
3368 ; AVX512-NEXT: vmovdqa64 %zmm13, %zmm23
3369 ; AVX512-NEXT: vpermt2q %zmm19, %zmm16, %zmm23
3370 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm26
3371 ; AVX512-NEXT: vpermt2q %zmm12, %zmm11, %zmm26
3372 ; AVX512-NEXT: vmovdqa64 %zmm23, %zmm26 {%k1}
3373 ; AVX512-NEXT: vmovdqa64 %zmm13, %zmm23
3374 ; AVX512-NEXT: vpermt2q %zmm19, %zmm18, %zmm23
3375 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm27
3376 ; AVX512-NEXT: vpermt2q %zmm12, %zmm15, %zmm27
3377 ; AVX512-NEXT: vmovdqa64 %zmm23, %zmm27 {%k1}
3378 ; AVX512-NEXT: vpermt2q %zmm19, %zmm20, %zmm13
3379 ; AVX512-NEXT: vpermt2q %zmm12, %zmm21, %zmm2
3380 ; AVX512-NEXT: vmovdqa64 %zmm13, %zmm2 {%k1}
3381 ; AVX512-NEXT: vpermi2q %zmm9, %zmm6, %zmm14
3382 ; AVX512-NEXT: vpermi2q %zmm5, %zmm3, %zmm7
3383 ; AVX512-NEXT: vmovdqa64 %zmm14, %zmm7 {%k1}
3384 ; AVX512-NEXT: vpermi2q %zmm9, %zmm6, %zmm16
3385 ; AVX512-NEXT: vpermi2q %zmm5, %zmm3, %zmm11
3386 ; AVX512-NEXT: vmovdqa64 %zmm16, %zmm11 {%k1}
3387 ; AVX512-NEXT: vpermi2q %zmm9, %zmm6, %zmm18
3388 ; AVX512-NEXT: vpermi2q %zmm5, %zmm3, %zmm15
3389 ; AVX512-NEXT: vmovdqa64 %zmm18, %zmm15 {%k1}
3390 ; AVX512-NEXT: vpermt2q %zmm9, %zmm20, %zmm6
3391 ; AVX512-NEXT: vpermt2q %zmm5, %zmm21, %zmm3
3392 ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1}
3393 ; AVX512-NEXT: vmovdqa64 %zmm3, 896(%r8)
3394 ; AVX512-NEXT: vmovdqa64 %zmm15, 960(%r8)
3395 ; AVX512-NEXT: vmovdqa64 %zmm11, 768(%r8)
3396 ; AVX512-NEXT: vmovdqa64 %zmm7, 832(%r8)
3397 ; AVX512-NEXT: vmovdqa64 %zmm2, 640(%r8)
3398 ; AVX512-NEXT: vmovdqa64 %zmm27, 704(%r8)
3399 ; AVX512-NEXT: vmovdqa64 %zmm26, 512(%r8)
3400 ; AVX512-NEXT: vmovdqa64 %zmm25, 576(%r8)
3401 ; AVX512-NEXT: vmovdqa64 %zmm1, 384(%r8)
3402 ; AVX512-NEXT: vmovdqa64 %zmm24, 448(%r8)
3403 ; AVX512-NEXT: vmovdqa64 %zmm22, 256(%r8)
3404 ; AVX512-NEXT: vmovdqa64 %zmm17, 320(%r8)
3405 ; AVX512-NEXT: vmovdqa64 %zmm0, 128(%r8)
3406 ; AVX512-NEXT: vmovdqa64 %zmm10, 192(%r8)
3407 ; AVX512-NEXT: vmovdqa64 %zmm8, (%r8)
3408 ; AVX512-NEXT: vmovdqa64 %zmm4, 64(%r8)
3409 ; AVX512-NEXT: vzeroupper
3412 ; AVX512-FCP-LABEL: store_i64_stride4_vf32:
3413 ; AVX512-FCP: # %bb.0:
3414 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
3415 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
3416 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2
3417 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3
3418 ; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm17
3419 ; AVX512-FCP-NEXT: vmovdqa64 64(%rsi), %zmm23
3420 ; AVX512-FCP-NEXT: vmovdqa64 128(%rsi), %zmm12
3421 ; AVX512-FCP-NEXT: vmovdqa64 192(%rsi), %zmm5
3422 ; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm22
3423 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdx), %zmm25
3424 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdx), %zmm13
3425 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdx), %zmm6
3426 ; AVX512-FCP-NEXT: vmovdqa64 (%rcx), %zmm21
3427 ; AVX512-FCP-NEXT: vmovdqa64 64(%rcx), %zmm26
3428 ; AVX512-FCP-NEXT: vmovdqa64 128(%rcx), %zmm19
3429 ; AVX512-FCP-NEXT: vmovdqa64 192(%rcx), %zmm9
3430 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,2,10,0,0,3,11]
3431 ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm8
3432 ; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm14, %zmm8
3433 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [2,10,0,0,3,11,0,0]
3434 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm4
3435 ; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm7, %zmm4
3436 ; AVX512-FCP-NEXT: movb $-52, %al
3437 ; AVX512-FCP-NEXT: kmovw %eax, %k1
3438 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1}
3439 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,0,0,8,0,0,1,9]
3440 ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm10
3441 ; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm16, %zmm10
3442 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,8,0,0,1,9,0,0]
3443 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm8
3444 ; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm11, %zmm8
3445 ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1}
3446 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,6,14,0,0,7,15]
3447 ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm20
3448 ; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm18, %zmm20
3449 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [6,14,0,0,7,15,0,0]
3450 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm10
3451 ; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm15, %zmm10
3452 ; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm10 {%k1}
3453 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,4,12,0,0,5,13]
3454 ; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm20, %zmm22
3455 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [4,12,0,0,5,13,0,0]
3456 ; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm21, %zmm0
3457 ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm0 {%k1}
3458 ; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm22
3459 ; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm14, %zmm22
3460 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm17
3461 ; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm7, %zmm17
3462 ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm17 {%k1}
3463 ; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm24
3464 ; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm16, %zmm24
3465 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm22
3466 ; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm11, %zmm22
3467 ; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm22 {%k1}
3468 ; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm27
3469 ; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm18, %zmm27
3470 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm24
3471 ; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm15, %zmm24
3472 ; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm24 {%k1}
3473 ; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm20, %zmm25
3474 ; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm21, %zmm1
3475 ; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm1 {%k1}
3476 ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm23
3477 ; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm14, %zmm23
3478 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm25
3479 ; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm7, %zmm25
3480 ; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm25 {%k1}
3481 ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm23
3482 ; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm16, %zmm23
3483 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm26
3484 ; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm11, %zmm26
3485 ; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm26 {%k1}
3486 ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm23
3487 ; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm18, %zmm23
3488 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm27
3489 ; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm15, %zmm27
3490 ; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm27 {%k1}
3491 ; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm20, %zmm13
3492 ; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm21, %zmm2
3493 ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm2 {%k1}
3494 ; AVX512-FCP-NEXT: vpermi2q %zmm9, %zmm6, %zmm14
3495 ; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm3, %zmm7
3496 ; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm7 {%k1}
3497 ; AVX512-FCP-NEXT: vpermi2q %zmm9, %zmm6, %zmm16
3498 ; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm3, %zmm11
3499 ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm11 {%k1}
3500 ; AVX512-FCP-NEXT: vpermi2q %zmm9, %zmm6, %zmm18
3501 ; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm3, %zmm15
3502 ; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm15 {%k1}
3503 ; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm20, %zmm6
3504 ; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm21, %zmm3
3505 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1}
3506 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 896(%r8)
3507 ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, 960(%r8)
3508 ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, 768(%r8)
3509 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 832(%r8)
3510 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 640(%r8)
3511 ; AVX512-FCP-NEXT: vmovdqa64 %zmm27, 704(%r8)
3512 ; AVX512-FCP-NEXT: vmovdqa64 %zmm26, 512(%r8)
3513 ; AVX512-FCP-NEXT: vmovdqa64 %zmm25, 576(%r8)
3514 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 384(%r8)
3515 ; AVX512-FCP-NEXT: vmovdqa64 %zmm24, 448(%r8)
3516 ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, 256(%r8)
3517 ; AVX512-FCP-NEXT: vmovdqa64 %zmm17, 320(%r8)
3518 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 128(%r8)
3519 ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, 192(%r8)
3520 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, (%r8)
3521 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 64(%r8)
3522 ; AVX512-FCP-NEXT: vzeroupper
3523 ; AVX512-FCP-NEXT: retq
3525 ; AVX512DQ-LABEL: store_i64_stride4_vf32:
3526 ; AVX512DQ: # %bb.0:
3527 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0
3528 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm1
3529 ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm2
3530 ; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm3
3531 ; AVX512DQ-NEXT: vmovdqa64 (%rsi), %zmm17
3532 ; AVX512DQ-NEXT: vmovdqa64 64(%rsi), %zmm23
3533 ; AVX512DQ-NEXT: vmovdqa64 128(%rsi), %zmm12
3534 ; AVX512DQ-NEXT: vmovdqa64 192(%rsi), %zmm5
3535 ; AVX512DQ-NEXT: vmovdqa64 (%rdx), %zmm22
3536 ; AVX512DQ-NEXT: vmovdqa64 64(%rdx), %zmm25
3537 ; AVX512DQ-NEXT: vmovdqa64 128(%rdx), %zmm13
3538 ; AVX512DQ-NEXT: vmovdqa64 192(%rdx), %zmm6
3539 ; AVX512DQ-NEXT: vmovdqa64 (%rcx), %zmm21
3540 ; AVX512DQ-NEXT: vmovdqa64 64(%rcx), %zmm26
3541 ; AVX512DQ-NEXT: vmovdqa64 128(%rcx), %zmm19
3542 ; AVX512DQ-NEXT: vmovdqa64 192(%rcx), %zmm9
3543 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,2,10,0,0,3,11]
3544 ; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm8
3545 ; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm14, %zmm8
3546 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm7 = [2,10,0,0,3,11,0,0]
3547 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm4
3548 ; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm7, %zmm4
3549 ; AVX512DQ-NEXT: movb $-52, %al
3550 ; AVX512DQ-NEXT: kmovw %eax, %k1
3551 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1}
3552 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,0,0,8,0,0,1,9]
3553 ; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm10
3554 ; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm16, %zmm10
3555 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,8,0,0,1,9,0,0]
3556 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm8
3557 ; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm11, %zmm8
3558 ; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1}
3559 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,6,14,0,0,7,15]
3560 ; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm20
3561 ; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm18, %zmm20
3562 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm15 = [6,14,0,0,7,15,0,0]
3563 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm10
3564 ; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm15, %zmm10
3565 ; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm10 {%k1}
3566 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,4,12,0,0,5,13]
3567 ; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm20, %zmm22
3568 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm21 = [4,12,0,0,5,13,0,0]
3569 ; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm21, %zmm0
3570 ; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm0 {%k1}
3571 ; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm22
3572 ; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm14, %zmm22
3573 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm17
3574 ; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm7, %zmm17
3575 ; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm17 {%k1}
3576 ; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm24
3577 ; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm16, %zmm24
3578 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm22
3579 ; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm11, %zmm22
3580 ; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm22 {%k1}
3581 ; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm27
3582 ; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm18, %zmm27
3583 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm24
3584 ; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm15, %zmm24
3585 ; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm24 {%k1}
3586 ; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm20, %zmm25
3587 ; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm21, %zmm1
3588 ; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm1 {%k1}
3589 ; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm23
3590 ; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm14, %zmm23
3591 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm25
3592 ; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm7, %zmm25
3593 ; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm25 {%k1}
3594 ; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm23
3595 ; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm16, %zmm23
3596 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm26
3597 ; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm11, %zmm26
3598 ; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm26 {%k1}
3599 ; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm23
3600 ; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm18, %zmm23
3601 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm27
3602 ; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm15, %zmm27
3603 ; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm27 {%k1}
3604 ; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm20, %zmm13
3605 ; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm21, %zmm2
3606 ; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm2 {%k1}
3607 ; AVX512DQ-NEXT: vpermi2q %zmm9, %zmm6, %zmm14
3608 ; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm3, %zmm7
3609 ; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm7 {%k1}
3610 ; AVX512DQ-NEXT: vpermi2q %zmm9, %zmm6, %zmm16
3611 ; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm3, %zmm11
3612 ; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm11 {%k1}
3613 ; AVX512DQ-NEXT: vpermi2q %zmm9, %zmm6, %zmm18
3614 ; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm3, %zmm15
3615 ; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm15 {%k1}
3616 ; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm20, %zmm6
3617 ; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm21, %zmm3
3618 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1}
3619 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, 896(%r8)
3620 ; AVX512DQ-NEXT: vmovdqa64 %zmm15, 960(%r8)
3621 ; AVX512DQ-NEXT: vmovdqa64 %zmm11, 768(%r8)
3622 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, 832(%r8)
3623 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, 640(%r8)
3624 ; AVX512DQ-NEXT: vmovdqa64 %zmm27, 704(%r8)
3625 ; AVX512DQ-NEXT: vmovdqa64 %zmm26, 512(%r8)
3626 ; AVX512DQ-NEXT: vmovdqa64 %zmm25, 576(%r8)
3627 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, 384(%r8)
3628 ; AVX512DQ-NEXT: vmovdqa64 %zmm24, 448(%r8)
3629 ; AVX512DQ-NEXT: vmovdqa64 %zmm22, 256(%r8)
3630 ; AVX512DQ-NEXT: vmovdqa64 %zmm17, 320(%r8)
3631 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 128(%r8)
3632 ; AVX512DQ-NEXT: vmovdqa64 %zmm10, 192(%r8)
3633 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, (%r8)
3634 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, 64(%r8)
3635 ; AVX512DQ-NEXT: vzeroupper
3636 ; AVX512DQ-NEXT: retq
3638 ; AVX512DQ-FCP-LABEL: store_i64_stride4_vf32:
3639 ; AVX512DQ-FCP: # %bb.0:
3640 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
3641 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
3642 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2
3643 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3
3644 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm17
3645 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rsi), %zmm23
3646 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rsi), %zmm12
3647 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rsi), %zmm5
3648 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm22
3649 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), %zmm25
3650 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdx), %zmm13
3651 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdx), %zmm6
3652 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rcx), %zmm21
3653 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rcx), %zmm26
3654 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rcx), %zmm19
3655 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rcx), %zmm9
3656 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,2,10,0,0,3,11]
3657 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm8
3658 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm14, %zmm8
3659 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [2,10,0,0,3,11,0,0]
3660 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm4
3661 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm7, %zmm4
3662 ; AVX512DQ-FCP-NEXT: movb $-52, %al
3663 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1
3664 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1}
3665 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,0,0,8,0,0,1,9]
3666 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm10
3667 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm16, %zmm10
3668 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,8,0,0,1,9,0,0]
3669 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm8
3670 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm11, %zmm8
3671 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1}
3672 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,6,14,0,0,7,15]
3673 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm20
3674 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm18, %zmm20
3675 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [6,14,0,0,7,15,0,0]
3676 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm10
3677 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm15, %zmm10
3678 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm10 {%k1}
3679 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,4,12,0,0,5,13]
3680 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm20, %zmm22
3681 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [4,12,0,0,5,13,0,0]
3682 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm21, %zmm0
3683 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm0 {%k1}
3684 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm22
3685 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm14, %zmm22
3686 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm17
3687 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm7, %zmm17
3688 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm17 {%k1}
3689 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm24
3690 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm16, %zmm24
3691 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm22
3692 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm11, %zmm22
3693 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm22 {%k1}
3694 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm27
3695 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm18, %zmm27
3696 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm24
3697 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm15, %zmm24
3698 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm24 {%k1}
3699 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm20, %zmm25
3700 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm21, %zmm1
3701 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm1 {%k1}
3702 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm23
3703 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm14, %zmm23
3704 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm25
3705 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm7, %zmm25
3706 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm25 {%k1}
3707 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm23
3708 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm16, %zmm23
3709 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm26
3710 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm11, %zmm26
3711 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm26 {%k1}
3712 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm23
3713 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm18, %zmm23
3714 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm27
3715 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm15, %zmm27
3716 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm27 {%k1}
3717 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm20, %zmm13
3718 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm21, %zmm2
3719 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm2 {%k1}
3720 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm9, %zmm6, %zmm14
3721 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm3, %zmm7
3722 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm7 {%k1}
3723 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm9, %zmm6, %zmm16
3724 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm3, %zmm11
3725 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm11 {%k1}
3726 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm9, %zmm6, %zmm18
3727 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm3, %zmm15
3728 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm15 {%k1}
3729 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm20, %zmm6
3730 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm21, %zmm3
3731 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1}
3732 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 896(%r8)
3733 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, 960(%r8)
3734 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, 768(%r8)
3735 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 832(%r8)
3736 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 640(%r8)
3737 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, 704(%r8)
3738 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, 512(%r8)
3739 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, 576(%r8)
3740 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 384(%r8)
3741 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, 448(%r8)
3742 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, 256(%r8)
3743 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, 320(%r8)
3744 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 128(%r8)
3745 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, 192(%r8)
3746 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, (%r8)
3747 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 64(%r8)
3748 ; AVX512DQ-FCP-NEXT: vzeroupper
3749 ; AVX512DQ-FCP-NEXT: retq
3751 ; AVX512BW-LABEL: store_i64_stride4_vf32:
3752 ; AVX512BW: # %bb.0:
3753 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
3754 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1
3755 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm2
3756 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm3
3757 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm17
3758 ; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm23
3759 ; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm12
3760 ; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm5
3761 ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm22
3762 ; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm25
3763 ; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm13
3764 ; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm6
3765 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm21
3766 ; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm26
3767 ; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm19
3768 ; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm9
3769 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,2,10,0,0,3,11]
3770 ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm8
3771 ; AVX512BW-NEXT: vpermt2q %zmm21, %zmm14, %zmm8
3772 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [2,10,0,0,3,11,0,0]
3773 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4
3774 ; AVX512BW-NEXT: vpermt2q %zmm17, %zmm7, %zmm4
3775 ; AVX512BW-NEXT: movb $-52, %al
3776 ; AVX512BW-NEXT: kmovd %eax, %k1
3777 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1}
3778 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,0,0,8,0,0,1,9]
3779 ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm10
3780 ; AVX512BW-NEXT: vpermt2q %zmm21, %zmm16, %zmm10
3781 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,8,0,0,1,9,0,0]
3782 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8
3783 ; AVX512BW-NEXT: vpermt2q %zmm17, %zmm11, %zmm8
3784 ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1}
3785 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,6,14,0,0,7,15]
3786 ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm20
3787 ; AVX512BW-NEXT: vpermt2q %zmm21, %zmm18, %zmm20
3788 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm15 = [6,14,0,0,7,15,0,0]
3789 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10
3790 ; AVX512BW-NEXT: vpermt2q %zmm17, %zmm15, %zmm10
3791 ; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm10 {%k1}
3792 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,4,12,0,0,5,13]
3793 ; AVX512BW-NEXT: vpermt2q %zmm21, %zmm20, %zmm22
3794 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm21 = [4,12,0,0,5,13,0,0]
3795 ; AVX512BW-NEXT: vpermt2q %zmm17, %zmm21, %zmm0
3796 ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm0 {%k1}
3797 ; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm22
3798 ; AVX512BW-NEXT: vpermt2q %zmm26, %zmm14, %zmm22
3799 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm17
3800 ; AVX512BW-NEXT: vpermt2q %zmm23, %zmm7, %zmm17
3801 ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm17 {%k1}
3802 ; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm24
3803 ; AVX512BW-NEXT: vpermt2q %zmm26, %zmm16, %zmm24
3804 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm22
3805 ; AVX512BW-NEXT: vpermt2q %zmm23, %zmm11, %zmm22
3806 ; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm22 {%k1}
3807 ; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm27
3808 ; AVX512BW-NEXT: vpermt2q %zmm26, %zmm18, %zmm27
3809 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm24
3810 ; AVX512BW-NEXT: vpermt2q %zmm23, %zmm15, %zmm24
3811 ; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm24 {%k1}
3812 ; AVX512BW-NEXT: vpermt2q %zmm26, %zmm20, %zmm25
3813 ; AVX512BW-NEXT: vpermt2q %zmm23, %zmm21, %zmm1
3814 ; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm1 {%k1}
3815 ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm23
3816 ; AVX512BW-NEXT: vpermt2q %zmm19, %zmm14, %zmm23
3817 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm25
3818 ; AVX512BW-NEXT: vpermt2q %zmm12, %zmm7, %zmm25
3819 ; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm25 {%k1}
3820 ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm23
3821 ; AVX512BW-NEXT: vpermt2q %zmm19, %zmm16, %zmm23
3822 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm26
3823 ; AVX512BW-NEXT: vpermt2q %zmm12, %zmm11, %zmm26
3824 ; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm26 {%k1}
3825 ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm23
3826 ; AVX512BW-NEXT: vpermt2q %zmm19, %zmm18, %zmm23
3827 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm27
3828 ; AVX512BW-NEXT: vpermt2q %zmm12, %zmm15, %zmm27
3829 ; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm27 {%k1}
3830 ; AVX512BW-NEXT: vpermt2q %zmm19, %zmm20, %zmm13
3831 ; AVX512BW-NEXT: vpermt2q %zmm12, %zmm21, %zmm2
3832 ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm2 {%k1}
3833 ; AVX512BW-NEXT: vpermi2q %zmm9, %zmm6, %zmm14
3834 ; AVX512BW-NEXT: vpermi2q %zmm5, %zmm3, %zmm7
3835 ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm7 {%k1}
3836 ; AVX512BW-NEXT: vpermi2q %zmm9, %zmm6, %zmm16
3837 ; AVX512BW-NEXT: vpermi2q %zmm5, %zmm3, %zmm11
3838 ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm11 {%k1}
3839 ; AVX512BW-NEXT: vpermi2q %zmm9, %zmm6, %zmm18
3840 ; AVX512BW-NEXT: vpermi2q %zmm5, %zmm3, %zmm15
3841 ; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm15 {%k1}
3842 ; AVX512BW-NEXT: vpermt2q %zmm9, %zmm20, %zmm6
3843 ; AVX512BW-NEXT: vpermt2q %zmm5, %zmm21, %zmm3
3844 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1}
3845 ; AVX512BW-NEXT: vmovdqa64 %zmm3, 896(%r8)
3846 ; AVX512BW-NEXT: vmovdqa64 %zmm15, 960(%r8)
3847 ; AVX512BW-NEXT: vmovdqa64 %zmm11, 768(%r8)
3848 ; AVX512BW-NEXT: vmovdqa64 %zmm7, 832(%r8)
3849 ; AVX512BW-NEXT: vmovdqa64 %zmm2, 640(%r8)
3850 ; AVX512BW-NEXT: vmovdqa64 %zmm27, 704(%r8)
3851 ; AVX512BW-NEXT: vmovdqa64 %zmm26, 512(%r8)
3852 ; AVX512BW-NEXT: vmovdqa64 %zmm25, 576(%r8)
3853 ; AVX512BW-NEXT: vmovdqa64 %zmm1, 384(%r8)
3854 ; AVX512BW-NEXT: vmovdqa64 %zmm24, 448(%r8)
3855 ; AVX512BW-NEXT: vmovdqa64 %zmm22, 256(%r8)
3856 ; AVX512BW-NEXT: vmovdqa64 %zmm17, 320(%r8)
3857 ; AVX512BW-NEXT: vmovdqa64 %zmm0, 128(%r8)
3858 ; AVX512BW-NEXT: vmovdqa64 %zmm10, 192(%r8)
3859 ; AVX512BW-NEXT: vmovdqa64 %zmm8, (%r8)
3860 ; AVX512BW-NEXT: vmovdqa64 %zmm4, 64(%r8)
3861 ; AVX512BW-NEXT: vzeroupper
3862 ; AVX512BW-NEXT: retq
3864 ; AVX512BW-FCP-LABEL: store_i64_stride4_vf32:
3865 ; AVX512BW-FCP: # %bb.0:
3866 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
3867 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
3868 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2
3869 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3
3870 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm17
3871 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm23
3872 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rsi), %zmm12
3873 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rsi), %zmm5
3874 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm22
3875 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm25
3876 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdx), %zmm13
3877 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdx), %zmm6
3878 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm21
3879 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm26
3880 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rcx), %zmm19
3881 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rcx), %zmm9
3882 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,2,10,0,0,3,11]
3883 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm8
3884 ; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm14, %zmm8
3885 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [2,10,0,0,3,11,0,0]
3886 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4
3887 ; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm7, %zmm4
3888 ; AVX512BW-FCP-NEXT: movb $-52, %al
3889 ; AVX512BW-FCP-NEXT: kmovd %eax, %k1
3890 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1}
3891 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,0,0,8,0,0,1,9]
3892 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm10
3893 ; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm16, %zmm10
3894 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,8,0,0,1,9,0,0]
3895 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm8
3896 ; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm11, %zmm8
3897 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1}
3898 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,6,14,0,0,7,15]
3899 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm20
3900 ; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm18, %zmm20
3901 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [6,14,0,0,7,15,0,0]
3902 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm10
3903 ; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm15, %zmm10
3904 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm10 {%k1}
3905 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,4,12,0,0,5,13]
3906 ; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm20, %zmm22
3907 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [4,12,0,0,5,13,0,0]
3908 ; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm21, %zmm0
3909 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm0 {%k1}
3910 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm22
3911 ; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm14, %zmm22
3912 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm17
3913 ; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm7, %zmm17
3914 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm17 {%k1}
3915 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm24
3916 ; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm16, %zmm24
3917 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm22
3918 ; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm11, %zmm22
3919 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm22 {%k1}
3920 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm27
3921 ; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm18, %zmm27
3922 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm24
3923 ; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm15, %zmm24
3924 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm24 {%k1}
3925 ; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm20, %zmm25
3926 ; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm21, %zmm1
3927 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm1 {%k1}
3928 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm23
3929 ; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm14, %zmm23
3930 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm25
3931 ; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm7, %zmm25
3932 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm25 {%k1}
3933 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm23
3934 ; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm16, %zmm23
3935 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm26
3936 ; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm11, %zmm26
3937 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm26 {%k1}
3938 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm23
3939 ; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm18, %zmm23
3940 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm27
3941 ; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm15, %zmm27
3942 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm27 {%k1}
3943 ; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm20, %zmm13
3944 ; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm21, %zmm2
3945 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm2 {%k1}
3946 ; AVX512BW-FCP-NEXT: vpermi2q %zmm9, %zmm6, %zmm14
3947 ; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm3, %zmm7
3948 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm7 {%k1}
3949 ; AVX512BW-FCP-NEXT: vpermi2q %zmm9, %zmm6, %zmm16
3950 ; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm3, %zmm11
3951 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm11 {%k1}
3952 ; AVX512BW-FCP-NEXT: vpermi2q %zmm9, %zmm6, %zmm18
3953 ; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm3, %zmm15
3954 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm15 {%k1}
3955 ; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm20, %zmm6
3956 ; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm21, %zmm3
3957 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1}
3958 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 896(%r8)
3959 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, 960(%r8)
3960 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, 768(%r8)
3961 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, 832(%r8)
3962 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 640(%r8)
3963 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, 704(%r8)
3964 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, 512(%r8)
3965 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, 576(%r8)
3966 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 384(%r8)
3967 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, 448(%r8)
3968 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, 256(%r8)
3969 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, 320(%r8)
3970 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 128(%r8)
3971 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, 192(%r8)
3972 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, (%r8)
3973 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 64(%r8)
3974 ; AVX512BW-FCP-NEXT: vzeroupper
3975 ; AVX512BW-FCP-NEXT: retq
3977 ; AVX512DQ-BW-LABEL: store_i64_stride4_vf32:
3978 ; AVX512DQ-BW: # %bb.0:
3979 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0
3980 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1
3981 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm2
3982 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm3
3983 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm17
3984 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm23
3985 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rsi), %zmm12
3986 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rsi), %zmm5
3987 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm22
3988 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm25
3989 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdx), %zmm13
3990 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdx), %zmm6
3991 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm21
3992 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rcx), %zmm26
3993 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rcx), %zmm19
3994 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rcx), %zmm9
3995 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,2,10,0,0,3,11]
3996 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm8
3997 ; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm14, %zmm8
3998 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [2,10,0,0,3,11,0,0]
3999 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm4
4000 ; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm7, %zmm4
4001 ; AVX512DQ-BW-NEXT: movb $-52, %al
4002 ; AVX512DQ-BW-NEXT: kmovd %eax, %k1
4003 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1}
4004 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,0,0,8,0,0,1,9]
4005 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm10
4006 ; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm16, %zmm10
4007 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,8,0,0,1,9,0,0]
4008 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm8
4009 ; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm11, %zmm8
4010 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1}
4011 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,6,14,0,0,7,15]
4012 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm20
4013 ; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm18, %zmm20
4014 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm15 = [6,14,0,0,7,15,0,0]
4015 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm10
4016 ; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm15, %zmm10
4017 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm10 {%k1}
4018 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,4,12,0,0,5,13]
4019 ; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm20, %zmm22
4020 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm21 = [4,12,0,0,5,13,0,0]
4021 ; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm21, %zmm0
4022 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm0 {%k1}
4023 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm22
4024 ; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm14, %zmm22
4025 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm17
4026 ; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm7, %zmm17
4027 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm17 {%k1}
4028 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm24
4029 ; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm16, %zmm24
4030 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm22
4031 ; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm11, %zmm22
4032 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm22 {%k1}
4033 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm27
4034 ; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm18, %zmm27
4035 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm24
4036 ; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm15, %zmm24
4037 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm24 {%k1}
4038 ; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm20, %zmm25
4039 ; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm21, %zmm1
4040 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm1 {%k1}
4041 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm23
4042 ; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm14, %zmm23
4043 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm25
4044 ; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm7, %zmm25
4045 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm25 {%k1}
4046 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm23
4047 ; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm16, %zmm23
4048 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm26
4049 ; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm11, %zmm26
4050 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm26 {%k1}
4051 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm23
4052 ; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm18, %zmm23
4053 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm27
4054 ; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm15, %zmm27
4055 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm27 {%k1}
4056 ; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm20, %zmm13
4057 ; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm21, %zmm2
4058 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm2 {%k1}
4059 ; AVX512DQ-BW-NEXT: vpermi2q %zmm9, %zmm6, %zmm14
4060 ; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm3, %zmm7
4061 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm7 {%k1}
4062 ; AVX512DQ-BW-NEXT: vpermi2q %zmm9, %zmm6, %zmm16
4063 ; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm3, %zmm11
4064 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm11 {%k1}
4065 ; AVX512DQ-BW-NEXT: vpermi2q %zmm9, %zmm6, %zmm18
4066 ; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm3, %zmm15
4067 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm15 {%k1}
4068 ; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm20, %zmm6
4069 ; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm21, %zmm3
4070 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1}
4071 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 896(%r8)
4072 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, 960(%r8)
4073 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, 768(%r8)
4074 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, 832(%r8)
4075 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 640(%r8)
4076 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, 704(%r8)
4077 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, 512(%r8)
4078 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, 576(%r8)
4079 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 384(%r8)
4080 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, 448(%r8)
4081 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, 256(%r8)
4082 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, 320(%r8)
4083 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 128(%r8)
4084 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, 192(%r8)
4085 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, (%r8)
4086 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 64(%r8)
4087 ; AVX512DQ-BW-NEXT: vzeroupper
4088 ; AVX512DQ-BW-NEXT: retq
4090 ; AVX512DQ-BW-FCP-LABEL: store_i64_stride4_vf32:
4091 ; AVX512DQ-BW-FCP: # %bb.0:
4092 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
4093 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
4094 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2
4095 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3
4096 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm17
4097 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm23
4098 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rsi), %zmm12
4099 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rsi), %zmm5
4100 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm22
4101 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm25
4102 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdx), %zmm13
4103 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdx), %zmm6
4104 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm21
4105 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm26
4106 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rcx), %zmm19
4107 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rcx), %zmm9
4108 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,2,10,0,0,3,11]
4109 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm8
4110 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm14, %zmm8
4111 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [2,10,0,0,3,11,0,0]
4112 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4
4113 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm7, %zmm4
4114 ; AVX512DQ-BW-FCP-NEXT: movb $-52, %al
4115 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1
4116 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1}
4117 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,0,0,8,0,0,1,9]
4118 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm10
4119 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm16, %zmm10
4120 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,8,0,0,1,9,0,0]
4121 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm8
4122 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm11, %zmm8
4123 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1}
4124 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,6,14,0,0,7,15]
4125 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm20
4126 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm18, %zmm20
4127 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [6,14,0,0,7,15,0,0]
4128 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm10
4129 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm15, %zmm10
4130 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm10 {%k1}
4131 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,4,12,0,0,5,13]
4132 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm20, %zmm22
4133 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [4,12,0,0,5,13,0,0]
4134 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm21, %zmm0
4135 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm0 {%k1}
4136 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm22
4137 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm14, %zmm22
4138 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm17
4139 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm7, %zmm17
4140 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm17 {%k1}
4141 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm24
4142 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm16, %zmm24
4143 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm22
4144 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm11, %zmm22
4145 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm22 {%k1}
4146 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm27
4147 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm18, %zmm27
4148 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm24
4149 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm15, %zmm24
4150 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm24 {%k1}
4151 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm20, %zmm25
4152 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm21, %zmm1
4153 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm1 {%k1}
4154 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm23
4155 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm14, %zmm23
4156 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm25
4157 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm7, %zmm25
4158 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm25 {%k1}
4159 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm23
4160 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm16, %zmm23
4161 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm26
4162 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm11, %zmm26
4163 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm26 {%k1}
4164 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm23
4165 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm18, %zmm23
4166 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm27
4167 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm15, %zmm27
4168 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm27 {%k1}
4169 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm20, %zmm13
4170 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm21, %zmm2
4171 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm2 {%k1}
4172 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm9, %zmm6, %zmm14
4173 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm3, %zmm7
4174 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm7 {%k1}
4175 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm9, %zmm6, %zmm16
4176 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm3, %zmm11
4177 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm11 {%k1}
4178 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm9, %zmm6, %zmm18
4179 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm3, %zmm15
4180 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm15 {%k1}
4181 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm20, %zmm6
4182 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm21, %zmm3
4183 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1}
4184 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 896(%r8)
4185 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, 960(%r8)
4186 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, 768(%r8)
4187 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, 832(%r8)
4188 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 640(%r8)
4189 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, 704(%r8)
4190 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, 512(%r8)
4191 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, 576(%r8)
4192 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 384(%r8)
4193 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, 448(%r8)
4194 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, 256(%r8)
4195 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, 320(%r8)
4196 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 128(%r8)
4197 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, 192(%r8)
4198 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, (%r8)
4199 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 64(%r8)
4200 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
4201 ; AVX512DQ-BW-FCP-NEXT: retq
4202 %in.vec0 = load <32 x i64>, ptr %in.vecptr0, align 64
4203 %in.vec1 = load <32 x i64>, ptr %in.vecptr1, align 64
4204 %in.vec2 = load <32 x i64>, ptr %in.vecptr2, align 64
4205 %in.vec3 = load <32 x i64>, ptr %in.vecptr3, align 64
4206 %1 = shufflevector <32 x i64> %in.vec0, <32 x i64> %in.vec1, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
4207 %2 = shufflevector <32 x i64> %in.vec2, <32 x i64> %in.vec3, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
4208 %3 = shufflevector <64 x i64> %1, <64 x i64> %2, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
4209 %interleaved.vec = shufflevector <128 x i64> %3, <128 x i64> poison, <128 x i32> <i32 0, i32 32, i32 64, i32 96, i32 1, i32 33, i32 65, i32 97, i32 2, i32 34, i32 66, i32 98, i32 3, i32 35, i32 67, i32 99, i32 4, i32 36, i32 68, i32 100, i32 5, i32 37, i32 69, i32 101, i32 6, i32 38, i32 70, i32 102, i32 7, i32 39, i32 71, i32 103, i32 8, i32 40, i32 72, i32 104, i32 9, i32 41, i32 73, i32 105, i32 10, i32 42, i32 74, i32 106, i32 11, i32 43, i32 75, i32 107, i32 12, i32 44, i32 76, i32 108, i32 13, i32 45, i32 77, i32 109, i32 14, i32 46, i32 78, i32 110, i32 15, i32 47, i32 79, i32 111, i32 16, i32 48, i32 80, i32 112, i32 17, i32 49, i32 81, i32 113, i32 18, i32 50, i32 82, i32 114, i32 19, i32 51, i32 83, i32 115, i32 20, i32 52, i32 84, i32 116, i32 21, i32 53, i32 85, i32 117, i32 22, i32 54, i32 86, i32 118, i32 23, i32 55, i32 87, i32 119, i32 24, i32 56, i32 88, i32 120, i32 25, i32 57, i32 89, i32 121, i32 26, i32 58, i32 90, i32 122, i32 27, i32 59, i32 91, i32 123, i32 28, i32 60, i32 92, i32 124, i32 29, i32 61, i32 93, i32 125, i32 30, i32 62, i32 94, i32 126, i32 31, i32 63, i32 95, i32 127>
4210 store <128 x i64> %interleaved.vec, ptr %out.vec, align 64
4214 define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %out.vec) nounwind {
4215 ; SSE-LABEL: store_i64_stride4_vf64:
4217 ; SSE-NEXT: subq $1688, %rsp # imm = 0x698
4218 ; SSE-NEXT: movaps (%rdi), %xmm7
4219 ; SSE-NEXT: movaps 16(%rdi), %xmm8
4220 ; SSE-NEXT: movaps 32(%rdi), %xmm9
4221 ; SSE-NEXT: movaps 48(%rdi), %xmm10
4222 ; SSE-NEXT: movaps (%rsi), %xmm3
4223 ; SSE-NEXT: movaps 16(%rsi), %xmm2
4224 ; SSE-NEXT: movaps 32(%rsi), %xmm1
4225 ; SSE-NEXT: movaps 48(%rsi), %xmm0
4226 ; SSE-NEXT: movaps (%rdx), %xmm11
4227 ; SSE-NEXT: movaps 16(%rdx), %xmm12
4228 ; SSE-NEXT: movaps 32(%rdx), %xmm13
4229 ; SSE-NEXT: movaps 48(%rdx), %xmm14
4230 ; SSE-NEXT: movaps (%rcx), %xmm4
4231 ; SSE-NEXT: movaps 16(%rcx), %xmm5
4232 ; SSE-NEXT: movaps 32(%rcx), %xmm6
4233 ; SSE-NEXT: movaps %xmm7, %xmm15
4234 ; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm3[0]
4235 ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4236 ; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm3[1]
4237 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4238 ; SSE-NEXT: movaps %xmm11, %xmm3
4239 ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
4240 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4241 ; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm4[1]
4242 ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4243 ; SSE-NEXT: movaps %xmm8, %xmm3
4244 ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0]
4245 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4246 ; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm2[1]
4247 ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4248 ; SSE-NEXT: movaps %xmm12, %xmm2
4249 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm5[0]
4250 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4251 ; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm5[1]
4252 ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4253 ; SSE-NEXT: movaps %xmm9, %xmm2
4254 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
4255 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4256 ; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm1[1]
4257 ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4258 ; SSE-NEXT: movaps %xmm13, %xmm1
4259 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm6[0]
4260 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4261 ; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm6[1]
4262 ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4263 ; SSE-NEXT: movaps %xmm10, %xmm1
4264 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
4265 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4266 ; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1]
4267 ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4268 ; SSE-NEXT: movaps 48(%rcx), %xmm0
4269 ; SSE-NEXT: movaps %xmm14, %xmm1
4270 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
4271 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4272 ; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1]
4273 ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4274 ; SSE-NEXT: movaps 64(%rdi), %xmm2
4275 ; SSE-NEXT: movaps 64(%rsi), %xmm0
4276 ; SSE-NEXT: movaps %xmm2, %xmm1
4277 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
4278 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4279 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
4280 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4281 ; SSE-NEXT: movaps 64(%rdx), %xmm2
4282 ; SSE-NEXT: movaps 64(%rcx), %xmm0
4283 ; SSE-NEXT: movaps %xmm2, %xmm1
4284 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
4285 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4286 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
4287 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4288 ; SSE-NEXT: movaps 80(%rdi), %xmm2
4289 ; SSE-NEXT: movaps 80(%rsi), %xmm0
4290 ; SSE-NEXT: movaps %xmm2, %xmm1
4291 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
4292 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4293 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
4294 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4295 ; SSE-NEXT: movaps 80(%rdx), %xmm2
4296 ; SSE-NEXT: movaps 80(%rcx), %xmm0
4297 ; SSE-NEXT: movaps %xmm2, %xmm1
4298 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
4299 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4300 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
4301 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4302 ; SSE-NEXT: movaps 96(%rdi), %xmm2
4303 ; SSE-NEXT: movaps 96(%rsi), %xmm0
4304 ; SSE-NEXT: movaps %xmm2, %xmm1
4305 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
4306 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4307 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
4308 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4309 ; SSE-NEXT: movaps 96(%rdx), %xmm2
4310 ; SSE-NEXT: movaps 96(%rcx), %xmm0
4311 ; SSE-NEXT: movaps %xmm2, %xmm1
4312 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
4313 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4314 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
4315 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4316 ; SSE-NEXT: movaps 112(%rdi), %xmm2
4317 ; SSE-NEXT: movaps 112(%rsi), %xmm0
4318 ; SSE-NEXT: movaps %xmm2, %xmm1
4319 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
4320 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4321 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
4322 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4323 ; SSE-NEXT: movaps 112(%rdx), %xmm2
4324 ; SSE-NEXT: movaps 112(%rcx), %xmm0
4325 ; SSE-NEXT: movaps %xmm2, %xmm1
4326 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
4327 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4328 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
4329 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4330 ; SSE-NEXT: movaps 128(%rdi), %xmm2
4331 ; SSE-NEXT: movaps 128(%rsi), %xmm0
4332 ; SSE-NEXT: movaps %xmm2, %xmm1
4333 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
4334 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4335 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
4336 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4337 ; SSE-NEXT: movaps 128(%rdx), %xmm2
4338 ; SSE-NEXT: movaps 128(%rcx), %xmm0
4339 ; SSE-NEXT: movaps %xmm2, %xmm1
4340 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
4341 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4342 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
4343 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4344 ; SSE-NEXT: movaps 144(%rdi), %xmm2
4345 ; SSE-NEXT: movaps 144(%rsi), %xmm0
4346 ; SSE-NEXT: movaps %xmm2, %xmm1
4347 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
4348 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4349 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
4350 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4351 ; SSE-NEXT: movaps 144(%rdx), %xmm2
4352 ; SSE-NEXT: movaps 144(%rcx), %xmm0
4353 ; SSE-NEXT: movaps %xmm2, %xmm1
4354 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
4355 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4356 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
4357 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4358 ; SSE-NEXT: movaps 160(%rdi), %xmm2
4359 ; SSE-NEXT: movaps 160(%rsi), %xmm0
4360 ; SSE-NEXT: movaps %xmm2, %xmm1
4361 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
4362 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4363 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
4364 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4365 ; SSE-NEXT: movaps 160(%rdx), %xmm2
4366 ; SSE-NEXT: movaps 160(%rcx), %xmm0
4367 ; SSE-NEXT: movaps %xmm2, %xmm1
4368 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
4369 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4370 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
4371 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4372 ; SSE-NEXT: movaps 176(%rdi), %xmm2
4373 ; SSE-NEXT: movaps 176(%rsi), %xmm0
4374 ; SSE-NEXT: movaps %xmm2, %xmm1
4375 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
4376 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4377 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
4378 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4379 ; SSE-NEXT: movaps 176(%rdx), %xmm2
4380 ; SSE-NEXT: movaps 176(%rcx), %xmm0
4381 ; SSE-NEXT: movaps %xmm2, %xmm1
4382 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
4383 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4384 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
4385 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4386 ; SSE-NEXT: movaps 192(%rdi), %xmm2
4387 ; SSE-NEXT: movaps 192(%rsi), %xmm0
4388 ; SSE-NEXT: movaps %xmm2, %xmm1
4389 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
4390 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4391 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
4392 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4393 ; SSE-NEXT: movaps 192(%rdx), %xmm2
4394 ; SSE-NEXT: movaps 192(%rcx), %xmm0
4395 ; SSE-NEXT: movaps %xmm2, %xmm1
4396 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
4397 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4398 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
4399 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4400 ; SSE-NEXT: movaps 208(%rdi), %xmm2
4401 ; SSE-NEXT: movaps 208(%rsi), %xmm0
4402 ; SSE-NEXT: movaps %xmm2, %xmm1
4403 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
4404 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4405 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
4406 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4407 ; SSE-NEXT: movaps 208(%rdx), %xmm2
4408 ; SSE-NEXT: movaps 208(%rcx), %xmm0
4409 ; SSE-NEXT: movaps %xmm2, %xmm1
4410 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
4411 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4412 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
4413 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4414 ; SSE-NEXT: movaps 224(%rdi), %xmm2
4415 ; SSE-NEXT: movaps 224(%rsi), %xmm0
4416 ; SSE-NEXT: movaps %xmm2, %xmm1
4417 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
4418 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4419 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
4420 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4421 ; SSE-NEXT: movaps 224(%rdx), %xmm2
4422 ; SSE-NEXT: movaps 224(%rcx), %xmm0
4423 ; SSE-NEXT: movaps %xmm2, %xmm1
4424 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
4425 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4426 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
4427 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4428 ; SSE-NEXT: movaps 240(%rdi), %xmm2
4429 ; SSE-NEXT: movaps 240(%rsi), %xmm0
4430 ; SSE-NEXT: movaps %xmm2, %xmm1
4431 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
4432 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4433 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
4434 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4435 ; SSE-NEXT: movaps 240(%rdx), %xmm2
4436 ; SSE-NEXT: movaps 240(%rcx), %xmm0
4437 ; SSE-NEXT: movaps %xmm2, %xmm1
4438 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
4439 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4440 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
4441 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4442 ; SSE-NEXT: movaps 256(%rdi), %xmm2
4443 ; SSE-NEXT: movaps 256(%rsi), %xmm0
4444 ; SSE-NEXT: movaps %xmm2, %xmm1
4445 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
4446 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4447 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
4448 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4449 ; SSE-NEXT: movaps 256(%rdx), %xmm2
4450 ; SSE-NEXT: movaps 256(%rcx), %xmm0
4451 ; SSE-NEXT: movaps %xmm2, %xmm1
4452 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
4453 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4454 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
4455 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4456 ; SSE-NEXT: movaps 272(%rdi), %xmm2
4457 ; SSE-NEXT: movaps 272(%rsi), %xmm0
4458 ; SSE-NEXT: movaps %xmm2, %xmm1
4459 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
4460 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4461 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
4462 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4463 ; SSE-NEXT: movaps 272(%rdx), %xmm2
4464 ; SSE-NEXT: movaps 272(%rcx), %xmm0
4465 ; SSE-NEXT: movaps %xmm2, %xmm1
4466 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
4467 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4468 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
4469 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4470 ; SSE-NEXT: movaps 288(%rdi), %xmm2
4471 ; SSE-NEXT: movaps 288(%rsi), %xmm0
4472 ; SSE-NEXT: movaps %xmm2, %xmm1
4473 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
4474 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4475 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
4476 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4477 ; SSE-NEXT: movaps 288(%rdx), %xmm2
4478 ; SSE-NEXT: movaps 288(%rcx), %xmm0
4479 ; SSE-NEXT: movaps %xmm2, %xmm1
4480 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
4481 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4482 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
4483 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4484 ; SSE-NEXT: movaps 304(%rdi), %xmm2
4485 ; SSE-NEXT: movaps 304(%rsi), %xmm0
4486 ; SSE-NEXT: movaps %xmm2, %xmm1
4487 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
4488 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4489 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
4490 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4491 ; SSE-NEXT: movaps 304(%rdx), %xmm2
4492 ; SSE-NEXT: movaps 304(%rcx), %xmm0
4493 ; SSE-NEXT: movaps %xmm2, %xmm1
4494 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
4495 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4496 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
4497 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4498 ; SSE-NEXT: movaps 320(%rdi), %xmm2
4499 ; SSE-NEXT: movaps 320(%rsi), %xmm0
4500 ; SSE-NEXT: movaps %xmm2, %xmm1
4501 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
4502 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4503 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
4504 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4505 ; SSE-NEXT: movaps 320(%rdx), %xmm2
4506 ; SSE-NEXT: movaps 320(%rcx), %xmm0
4507 ; SSE-NEXT: movaps %xmm2, %xmm1
4508 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
4509 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4510 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
4511 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4512 ; SSE-NEXT: movaps 336(%rdi), %xmm2
4513 ; SSE-NEXT: movaps 336(%rsi), %xmm0
4514 ; SSE-NEXT: movaps %xmm2, %xmm1
4515 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
4516 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4517 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
4518 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4519 ; SSE-NEXT: movaps 336(%rdx), %xmm2
4520 ; SSE-NEXT: movaps 336(%rcx), %xmm0
4521 ; SSE-NEXT: movaps %xmm2, %xmm1
4522 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
4523 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4524 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
4525 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4526 ; SSE-NEXT: movaps 352(%rdi), %xmm2
4527 ; SSE-NEXT: movaps 352(%rsi), %xmm0
4528 ; SSE-NEXT: movaps %xmm2, %xmm1
4529 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
4530 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4531 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
4532 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4533 ; SSE-NEXT: movaps 352(%rdx), %xmm2
4534 ; SSE-NEXT: movaps 352(%rcx), %xmm0
4535 ; SSE-NEXT: movaps %xmm2, %xmm1
4536 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
4537 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4538 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
4539 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4540 ; SSE-NEXT: movaps 368(%rdi), %xmm2
4541 ; SSE-NEXT: movaps 368(%rsi), %xmm0
4542 ; SSE-NEXT: movaps %xmm2, %xmm1
4543 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
4544 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4545 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
4546 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4547 ; SSE-NEXT: movaps 368(%rdx), %xmm2
4548 ; SSE-NEXT: movaps 368(%rcx), %xmm0
4549 ; SSE-NEXT: movaps %xmm2, %xmm1
4550 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
4551 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4552 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
4553 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4554 ; SSE-NEXT: movaps 384(%rdi), %xmm2
4555 ; SSE-NEXT: movaps 384(%rsi), %xmm0
4556 ; SSE-NEXT: movaps %xmm2, %xmm1
4557 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
4558 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4559 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
4560 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4561 ; SSE-NEXT: movaps 384(%rdx), %xmm2
4562 ; SSE-NEXT: movaps 384(%rcx), %xmm0
4563 ; SSE-NEXT: movaps %xmm2, %xmm1
4564 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
4565 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4566 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
4567 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4568 ; SSE-NEXT: movaps 400(%rdi), %xmm2
4569 ; SSE-NEXT: movaps 400(%rsi), %xmm0
4570 ; SSE-NEXT: movaps %xmm2, %xmm1
4571 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
4572 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4573 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
4574 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4575 ; SSE-NEXT: movaps 400(%rdx), %xmm2
4576 ; SSE-NEXT: movaps 400(%rcx), %xmm0
4577 ; SSE-NEXT: movaps %xmm2, %xmm1
4578 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
4579 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4580 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
4581 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4582 ; SSE-NEXT: movaps 416(%rdi), %xmm2
4583 ; SSE-NEXT: movaps 416(%rsi), %xmm0
4584 ; SSE-NEXT: movaps %xmm2, %xmm1
4585 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
4586 ; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill
4587 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
4588 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4589 ; SSE-NEXT: movaps 416(%rdx), %xmm2
4590 ; SSE-NEXT: movaps 416(%rcx), %xmm0
4591 ; SSE-NEXT: movaps %xmm2, %xmm1
4592 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
4593 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4594 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
4595 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4596 ; SSE-NEXT: movaps 432(%rdi), %xmm2
4597 ; SSE-NEXT: movaps 432(%rsi), %xmm0
4598 ; SSE-NEXT: movaps %xmm2, %xmm1
4599 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
4600 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4601 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
4602 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4603 ; SSE-NEXT: movaps 432(%rdx), %xmm15
4604 ; SSE-NEXT: movaps 432(%rcx), %xmm0
4605 ; SSE-NEXT: movaps %xmm15, %xmm1
4606 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
4607 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4608 ; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1]
4609 ; SSE-NEXT: movaps 448(%rdi), %xmm12
4610 ; SSE-NEXT: movaps 448(%rsi), %xmm0
4611 ; SSE-NEXT: movaps %xmm12, %xmm1
4612 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
4613 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4614 ; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1]
4615 ; SSE-NEXT: movaps 448(%rdx), %xmm11
4616 ; SSE-NEXT: movaps 448(%rcx), %xmm0
4617 ; SSE-NEXT: movaps %xmm11, %xmm1
4618 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
4619 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4620 ; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1]
4621 ; SSE-NEXT: movaps 464(%rdi), %xmm13
4622 ; SSE-NEXT: movaps 464(%rsi), %xmm0
4623 ; SSE-NEXT: movaps %xmm13, %xmm14
4624 ; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm0[0]
4625 ; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1]
4626 ; SSE-NEXT: movaps 464(%rdx), %xmm7
4627 ; SSE-NEXT: movaps 464(%rcx), %xmm1
4628 ; SSE-NEXT: movaps %xmm7, %xmm10
4629 ; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm1[0]
4630 ; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm1[1]
4631 ; SSE-NEXT: movaps 480(%rdi), %xmm8
4632 ; SSE-NEXT: movaps 480(%rsi), %xmm0
4633 ; SSE-NEXT: movaps %xmm8, %xmm9
4634 ; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm0[0]
4635 ; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1]
4636 ; SSE-NEXT: movaps 480(%rdx), %xmm5
4637 ; SSE-NEXT: movaps 480(%rcx), %xmm0
4638 ; SSE-NEXT: movaps %xmm5, %xmm6
4639 ; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0]
4640 ; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1]
4641 ; SSE-NEXT: movaps 496(%rdi), %xmm2
4642 ; SSE-NEXT: movaps 496(%rsi), %xmm3
4643 ; SSE-NEXT: movaps %xmm2, %xmm4
4644 ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm3[0]
4645 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1]
4646 ; SSE-NEXT: movaps 496(%rdx), %xmm3
4647 ; SSE-NEXT: movaps 496(%rcx), %xmm1
4648 ; SSE-NEXT: movaps %xmm3, %xmm0
4649 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4650 ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1]
4651 ; SSE-NEXT: movaps %xmm3, 2032(%r8)
4652 ; SSE-NEXT: movaps %xmm2, 2016(%r8)
4653 ; SSE-NEXT: movaps %xmm0, 2000(%r8)
4654 ; SSE-NEXT: movaps %xmm4, 1984(%r8)
4655 ; SSE-NEXT: movaps %xmm5, 1968(%r8)
4656 ; SSE-NEXT: movaps %xmm8, 1952(%r8)
4657 ; SSE-NEXT: movaps %xmm6, 1936(%r8)
4658 ; SSE-NEXT: movaps %xmm9, 1920(%r8)
4659 ; SSE-NEXT: movaps %xmm7, 1904(%r8)
4660 ; SSE-NEXT: movaps %xmm13, 1888(%r8)
4661 ; SSE-NEXT: movaps %xmm10, 1872(%r8)
4662 ; SSE-NEXT: movaps %xmm14, 1856(%r8)
4663 ; SSE-NEXT: movaps %xmm11, 1840(%r8)
4664 ; SSE-NEXT: movaps %xmm12, 1824(%r8)
4665 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4666 ; SSE-NEXT: movaps %xmm0, 1808(%r8)
4667 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4668 ; SSE-NEXT: movaps %xmm0, 1792(%r8)
4669 ; SSE-NEXT: movaps %xmm15, 1776(%r8)
4670 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4671 ; SSE-NEXT: movaps %xmm0, 1760(%r8)
4672 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4673 ; SSE-NEXT: movaps %xmm0, 1744(%r8)
4674 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4675 ; SSE-NEXT: movaps %xmm0, 1728(%r8)
4676 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4677 ; SSE-NEXT: movaps %xmm0, 1712(%r8)
4678 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4679 ; SSE-NEXT: movaps %xmm0, 1696(%r8)
4680 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4681 ; SSE-NEXT: movaps %xmm0, 1680(%r8)
4682 ; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
4683 ; SSE-NEXT: movaps %xmm0, 1664(%r8)
4684 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4685 ; SSE-NEXT: movaps %xmm0, 1648(%r8)
4686 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4687 ; SSE-NEXT: movaps %xmm0, 1632(%r8)
4688 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4689 ; SSE-NEXT: movaps %xmm0, 1616(%r8)
4690 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4691 ; SSE-NEXT: movaps %xmm0, 1600(%r8)
4692 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4693 ; SSE-NEXT: movaps %xmm0, 1584(%r8)
4694 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4695 ; SSE-NEXT: movaps %xmm0, 1568(%r8)
4696 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4697 ; SSE-NEXT: movaps %xmm0, 1552(%r8)
4698 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4699 ; SSE-NEXT: movaps %xmm0, 1536(%r8)
4700 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4701 ; SSE-NEXT: movaps %xmm0, 1520(%r8)
4702 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4703 ; SSE-NEXT: movaps %xmm0, 1504(%r8)
4704 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4705 ; SSE-NEXT: movaps %xmm0, 1488(%r8)
4706 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4707 ; SSE-NEXT: movaps %xmm0, 1472(%r8)
4708 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4709 ; SSE-NEXT: movaps %xmm0, 1456(%r8)
4710 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4711 ; SSE-NEXT: movaps %xmm0, 1440(%r8)
4712 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4713 ; SSE-NEXT: movaps %xmm0, 1424(%r8)
4714 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4715 ; SSE-NEXT: movaps %xmm0, 1408(%r8)
4716 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4717 ; SSE-NEXT: movaps %xmm0, 1392(%r8)
4718 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4719 ; SSE-NEXT: movaps %xmm0, 1376(%r8)
4720 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4721 ; SSE-NEXT: movaps %xmm0, 1360(%r8)
4722 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4723 ; SSE-NEXT: movaps %xmm0, 1344(%r8)
4724 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4725 ; SSE-NEXT: movaps %xmm0, 1328(%r8)
4726 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4727 ; SSE-NEXT: movaps %xmm0, 1312(%r8)
4728 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4729 ; SSE-NEXT: movaps %xmm0, 1296(%r8)
4730 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4731 ; SSE-NEXT: movaps %xmm0, 1280(%r8)
4732 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4733 ; SSE-NEXT: movaps %xmm0, 1264(%r8)
4734 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4735 ; SSE-NEXT: movaps %xmm0, 1248(%r8)
4736 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4737 ; SSE-NEXT: movaps %xmm0, 1232(%r8)
4738 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4739 ; SSE-NEXT: movaps %xmm0, 1216(%r8)
4740 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4741 ; SSE-NEXT: movaps %xmm0, 1200(%r8)
4742 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4743 ; SSE-NEXT: movaps %xmm0, 1184(%r8)
4744 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4745 ; SSE-NEXT: movaps %xmm0, 1168(%r8)
4746 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4747 ; SSE-NEXT: movaps %xmm0, 1152(%r8)
4748 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4749 ; SSE-NEXT: movaps %xmm0, 1136(%r8)
4750 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4751 ; SSE-NEXT: movaps %xmm0, 1120(%r8)
4752 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4753 ; SSE-NEXT: movaps %xmm0, 1104(%r8)
4754 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4755 ; SSE-NEXT: movaps %xmm0, 1088(%r8)
4756 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4757 ; SSE-NEXT: movaps %xmm0, 1072(%r8)
4758 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4759 ; SSE-NEXT: movaps %xmm0, 1056(%r8)
4760 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4761 ; SSE-NEXT: movaps %xmm0, 1040(%r8)
4762 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4763 ; SSE-NEXT: movaps %xmm0, 1024(%r8)
4764 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4765 ; SSE-NEXT: movaps %xmm0, 1008(%r8)
4766 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4767 ; SSE-NEXT: movaps %xmm0, 992(%r8)
4768 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4769 ; SSE-NEXT: movaps %xmm0, 976(%r8)
4770 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4771 ; SSE-NEXT: movaps %xmm0, 960(%r8)
4772 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4773 ; SSE-NEXT: movaps %xmm0, 944(%r8)
4774 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4775 ; SSE-NEXT: movaps %xmm0, 928(%r8)
4776 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4777 ; SSE-NEXT: movaps %xmm0, 912(%r8)
4778 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4779 ; SSE-NEXT: movaps %xmm0, 896(%r8)
4780 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4781 ; SSE-NEXT: movaps %xmm0, 880(%r8)
4782 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4783 ; SSE-NEXT: movaps %xmm0, 864(%r8)
4784 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4785 ; SSE-NEXT: movaps %xmm0, 848(%r8)
4786 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4787 ; SSE-NEXT: movaps %xmm0, 832(%r8)
4788 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4789 ; SSE-NEXT: movaps %xmm0, 816(%r8)
4790 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4791 ; SSE-NEXT: movaps %xmm0, 800(%r8)
4792 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4793 ; SSE-NEXT: movaps %xmm0, 784(%r8)
4794 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4795 ; SSE-NEXT: movaps %xmm0, 768(%r8)
4796 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4797 ; SSE-NEXT: movaps %xmm0, 752(%r8)
4798 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4799 ; SSE-NEXT: movaps %xmm0, 736(%r8)
4800 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4801 ; SSE-NEXT: movaps %xmm0, 720(%r8)
4802 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4803 ; SSE-NEXT: movaps %xmm0, 704(%r8)
4804 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4805 ; SSE-NEXT: movaps %xmm0, 688(%r8)
4806 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4807 ; SSE-NEXT: movaps %xmm0, 672(%r8)
4808 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4809 ; SSE-NEXT: movaps %xmm0, 656(%r8)
4810 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4811 ; SSE-NEXT: movaps %xmm0, 640(%r8)
4812 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4813 ; SSE-NEXT: movaps %xmm0, 624(%r8)
4814 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4815 ; SSE-NEXT: movaps %xmm0, 608(%r8)
4816 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4817 ; SSE-NEXT: movaps %xmm0, 592(%r8)
4818 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4819 ; SSE-NEXT: movaps %xmm0, 576(%r8)
4820 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4821 ; SSE-NEXT: movaps %xmm0, 560(%r8)
4822 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4823 ; SSE-NEXT: movaps %xmm0, 544(%r8)
4824 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4825 ; SSE-NEXT: movaps %xmm0, 528(%r8)
4826 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4827 ; SSE-NEXT: movaps %xmm0, 512(%r8)
4828 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4829 ; SSE-NEXT: movaps %xmm0, 496(%r8)
4830 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4831 ; SSE-NEXT: movaps %xmm0, 480(%r8)
4832 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4833 ; SSE-NEXT: movaps %xmm0, 464(%r8)
4834 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4835 ; SSE-NEXT: movaps %xmm0, 448(%r8)
4836 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4837 ; SSE-NEXT: movaps %xmm0, 432(%r8)
4838 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4839 ; SSE-NEXT: movaps %xmm0, 416(%r8)
4840 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4841 ; SSE-NEXT: movaps %xmm0, 400(%r8)
4842 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4843 ; SSE-NEXT: movaps %xmm0, 384(%r8)
4844 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4845 ; SSE-NEXT: movaps %xmm0, 368(%r8)
4846 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4847 ; SSE-NEXT: movaps %xmm0, 352(%r8)
4848 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4849 ; SSE-NEXT: movaps %xmm0, 336(%r8)
4850 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4851 ; SSE-NEXT: movaps %xmm0, 320(%r8)
4852 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4853 ; SSE-NEXT: movaps %xmm0, 304(%r8)
4854 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4855 ; SSE-NEXT: movaps %xmm0, 288(%r8)
4856 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4857 ; SSE-NEXT: movaps %xmm0, 272(%r8)
4858 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4859 ; SSE-NEXT: movaps %xmm0, 256(%r8)
4860 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4861 ; SSE-NEXT: movaps %xmm0, 240(%r8)
4862 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4863 ; SSE-NEXT: movaps %xmm0, 224(%r8)
4864 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4865 ; SSE-NEXT: movaps %xmm0, 208(%r8)
4866 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4867 ; SSE-NEXT: movaps %xmm0, 192(%r8)
4868 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4869 ; SSE-NEXT: movaps %xmm0, 176(%r8)
4870 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4871 ; SSE-NEXT: movaps %xmm0, 160(%r8)
4872 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4873 ; SSE-NEXT: movaps %xmm0, 144(%r8)
4874 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4875 ; SSE-NEXT: movaps %xmm0, 128(%r8)
4876 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4877 ; SSE-NEXT: movaps %xmm0, 112(%r8)
4878 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4879 ; SSE-NEXT: movaps %xmm0, 96(%r8)
4880 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4881 ; SSE-NEXT: movaps %xmm0, 80(%r8)
4882 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4883 ; SSE-NEXT: movaps %xmm0, 64(%r8)
4884 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4885 ; SSE-NEXT: movaps %xmm0, 48(%r8)
4886 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4887 ; SSE-NEXT: movaps %xmm0, 32(%r8)
4888 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4889 ; SSE-NEXT: movaps %xmm0, 16(%r8)
4890 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4891 ; SSE-NEXT: movaps %xmm0, (%r8)
4892 ; SSE-NEXT: addq $1688, %rsp # imm = 0x698
4895 ; AVX-LABEL: store_i64_stride4_vf64:
4897 ; AVX-NEXT: subq $1688, %rsp # imm = 0x698
4898 ; AVX-NEXT: vmovaps 160(%rdx), %ymm0
4899 ; AVX-NEXT: vmovaps 128(%rdx), %ymm1
4900 ; AVX-NEXT: vmovaps 96(%rdx), %ymm3
4901 ; AVX-NEXT: vmovaps 64(%rdx), %ymm5
4902 ; AVX-NEXT: vmovaps 32(%rdx), %ymm7
4903 ; AVX-NEXT: vmovaps (%rdx), %ymm8
4904 ; AVX-NEXT: vmovaps 160(%rcx), %ymm2
4905 ; AVX-NEXT: vmovaps 128(%rcx), %ymm4
4906 ; AVX-NEXT: vmovaps 96(%rcx), %ymm6
4907 ; AVX-NEXT: vmovaps 64(%rcx), %ymm9
4908 ; AVX-NEXT: vmovaps 32(%rcx), %ymm10
4909 ; AVX-NEXT: vmovaps (%rcx), %ymm11
4910 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm8[0],ymm11[0],ymm8[2],ymm11[2]
4911 ; AVX-NEXT: vmovaps 16(%rsi), %xmm13
4912 ; AVX-NEXT: vmovaps 16(%rdi), %xmm14
4913 ; AVX-NEXT: vmovlhps {{.*#+}} xmm15 = xmm14[0],xmm13[0]
4914 ; AVX-NEXT: vblendps {{.*#+}} ymm12 = ymm15[0,1,2,3],ymm12[4,5,6,7]
4915 ; AVX-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4916 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm8[1],ymm11[1],ymm8[3],ymm11[3]
4917 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm14[1],xmm13[1]
4918 ; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1,2,3],ymm8[4,5,6,7]
4919 ; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4920 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm7[0],ymm10[0],ymm7[2],ymm10[2]
4921 ; AVX-NEXT: vmovaps 48(%rsi), %xmm11
4922 ; AVX-NEXT: vmovaps 48(%rdi), %xmm12
4923 ; AVX-NEXT: vmovlhps {{.*#+}} xmm13 = xmm12[0],xmm11[0]
4924 ; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
4925 ; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4926 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm10[1],ymm7[3],ymm10[3]
4927 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm12[1],xmm11[1]
4928 ; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
4929 ; AVX-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4930 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm5[0],ymm9[0],ymm5[2],ymm9[2]
4931 ; AVX-NEXT: vmovaps 80(%rsi), %xmm8
4932 ; AVX-NEXT: vmovaps 80(%rdi), %xmm10
4933 ; AVX-NEXT: vmovlhps {{.*#+}} xmm11 = xmm10[0],xmm8[0]
4934 ; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1,2,3],ymm7[4,5,6,7]
4935 ; AVX-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4936 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm5[1],ymm9[1],ymm5[3],ymm9[3]
4937 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm10[1],xmm8[1]
4938 ; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7]
4939 ; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4940 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm3[0],ymm6[0],ymm3[2],ymm6[2]
4941 ; AVX-NEXT: vmovaps 112(%rsi), %xmm7
4942 ; AVX-NEXT: vmovaps 112(%rdi), %xmm8
4943 ; AVX-NEXT: vmovlhps {{.*#+}} xmm9 = xmm8[0],xmm7[0]
4944 ; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7]
4945 ; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4946 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm6[1],ymm3[3],ymm6[3]
4947 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm8[1],xmm7[1]
4948 ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7]
4949 ; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4950 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm1[0],ymm4[0],ymm1[2],ymm4[2]
4951 ; AVX-NEXT: vmovaps 144(%rsi), %xmm5
4952 ; AVX-NEXT: vmovaps 144(%rdi), %xmm6
4953 ; AVX-NEXT: vmovlhps {{.*#+}} xmm7 = xmm6[0],xmm5[0]
4954 ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm3[4,5,6,7]
4955 ; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4956 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm4[1],ymm1[3],ymm4[3]
4957 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm6[1],xmm5[1]
4958 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
4959 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4960 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
4961 ; AVX-NEXT: vmovaps 176(%rsi), %xmm3
4962 ; AVX-NEXT: vmovaps 176(%rdi), %xmm4
4963 ; AVX-NEXT: vmovlhps {{.*#+}} xmm5 = xmm4[0],xmm3[0]
4964 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7]
4965 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4966 ; AVX-NEXT: vmovaps 192(%rdx), %ymm1
4967 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
4968 ; AVX-NEXT: vmovaps 192(%rcx), %ymm2
4969 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm4[1],xmm3[1]
4970 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
4971 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4972 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
4973 ; AVX-NEXT: vmovaps 208(%rsi), %xmm3
4974 ; AVX-NEXT: vmovaps 208(%rdi), %xmm4
4975 ; AVX-NEXT: vmovlhps {{.*#+}} xmm5 = xmm4[0],xmm3[0]
4976 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
4977 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4978 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm2[1],ymm1[3],ymm2[3]
4979 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm3[1]
4980 ; AVX-NEXT: vmovaps 224(%rdx), %ymm2
4981 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4982 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4983 ; AVX-NEXT: vmovaps 224(%rcx), %ymm0
4984 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm0[0],ymm2[2],ymm0[2]
4985 ; AVX-NEXT: vmovaps 240(%rsi), %xmm3
4986 ; AVX-NEXT: vmovaps 240(%rdi), %xmm4
4987 ; AVX-NEXT: vmovlhps {{.*#+}} xmm5 = xmm4[0],xmm3[0]
4988 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7]
4989 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4990 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3]
4991 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm3[1]
4992 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4993 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4994 ; AVX-NEXT: vmovaps 256(%rdx), %ymm0
4995 ; AVX-NEXT: vmovaps 256(%rcx), %ymm1
4996 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
4997 ; AVX-NEXT: vmovaps 272(%rsi), %xmm3
4998 ; AVX-NEXT: vmovaps 272(%rdi), %xmm4
4999 ; AVX-NEXT: vmovlhps {{.*#+}} xmm5 = xmm4[0],xmm3[0]
5000 ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
5001 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5002 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
5003 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm3[1]
5004 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5005 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5006 ; AVX-NEXT: vmovaps 288(%rdx), %ymm0
5007 ; AVX-NEXT: vmovaps 288(%rcx), %ymm1
5008 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
5009 ; AVX-NEXT: vmovaps 304(%rsi), %xmm3
5010 ; AVX-NEXT: vmovaps 304(%rdi), %xmm4
5011 ; AVX-NEXT: vmovlhps {{.*#+}} xmm5 = xmm4[0],xmm3[0]
5012 ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
5013 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5014 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
5015 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm3[1]
5016 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5017 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5018 ; AVX-NEXT: vmovaps 320(%rdx), %ymm0
5019 ; AVX-NEXT: vmovaps 320(%rcx), %ymm1
5020 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
5021 ; AVX-NEXT: vmovaps 336(%rsi), %xmm3
5022 ; AVX-NEXT: vmovaps 336(%rdi), %xmm4
5023 ; AVX-NEXT: vmovlhps {{.*#+}} xmm5 = xmm4[0],xmm3[0]
5024 ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
5025 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5026 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
5027 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm3[1]
5028 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5029 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5030 ; AVX-NEXT: vmovaps 352(%rdx), %ymm0
5031 ; AVX-NEXT: vmovaps 352(%rcx), %ymm1
5032 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
5033 ; AVX-NEXT: vmovaps 368(%rsi), %xmm3
5034 ; AVX-NEXT: vmovaps 368(%rdi), %xmm4
5035 ; AVX-NEXT: vmovlhps {{.*#+}} xmm5 = xmm4[0],xmm3[0]
5036 ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
5037 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5038 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
5039 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm3[1]
5040 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5041 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5042 ; AVX-NEXT: vmovaps 384(%rdx), %ymm0
5043 ; AVX-NEXT: vmovaps 384(%rcx), %ymm1
5044 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
5045 ; AVX-NEXT: vmovaps 400(%rsi), %xmm3
5046 ; AVX-NEXT: vmovaps 400(%rdi), %xmm4
5047 ; AVX-NEXT: vmovlhps {{.*#+}} xmm5 = xmm4[0],xmm3[0]
5048 ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
5049 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5050 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
5051 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm3[1]
5052 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5053 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5054 ; AVX-NEXT: vmovaps 416(%rdx), %ymm0
5055 ; AVX-NEXT: vmovaps 416(%rcx), %ymm1
5056 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
5057 ; AVX-NEXT: vmovaps 432(%rsi), %xmm3
5058 ; AVX-NEXT: vmovaps 432(%rdi), %xmm4
5059 ; AVX-NEXT: vmovlhps {{.*#+}} xmm5 = xmm4[0],xmm3[0]
5060 ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
5061 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5062 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
5063 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm3[1]
5064 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5065 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5066 ; AVX-NEXT: vmovaps 448(%rdx), %ymm0
5067 ; AVX-NEXT: vmovaps 448(%rcx), %ymm1
5068 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
5069 ; AVX-NEXT: vmovaps 464(%rsi), %xmm3
5070 ; AVX-NEXT: vmovaps 464(%rdi), %xmm4
5071 ; AVX-NEXT: vmovlhps {{.*#+}} xmm5 = xmm4[0],xmm3[0]
5072 ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
5073 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5074 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
5075 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm3[1]
5076 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5077 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5078 ; AVX-NEXT: vmovaps 480(%rdx), %ymm0
5079 ; AVX-NEXT: vmovaps 480(%rcx), %ymm1
5080 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
5081 ; AVX-NEXT: vmovaps 496(%rsi), %xmm3
5082 ; AVX-NEXT: vmovaps 496(%rdi), %xmm4
5083 ; AVX-NEXT: vmovlhps {{.*#+}} xmm5 = xmm4[0],xmm3[0]
5084 ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
5085 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5086 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
5087 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm3[1]
5088 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5089 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5090 ; AVX-NEXT: vmovaps 256(%rsi), %xmm0
5091 ; AVX-NEXT: vmovaps 256(%rdi), %xmm1
5092 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
5093 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5094 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
5095 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5096 ; AVX-NEXT: vmovaps 256(%rcx), %xmm0
5097 ; AVX-NEXT: vmovaps 256(%rdx), %xmm1
5098 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
5099 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5100 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
5101 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5102 ; AVX-NEXT: vmovaps 128(%rsi), %xmm0
5103 ; AVX-NEXT: vmovaps 128(%rdi), %xmm1
5104 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
5105 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5106 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
5107 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5108 ; AVX-NEXT: vmovaps 128(%rcx), %xmm0
5109 ; AVX-NEXT: vmovaps 128(%rdx), %xmm1
5110 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
5111 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5112 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
5113 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5114 ; AVX-NEXT: vmovaps 64(%rsi), %xmm0
5115 ; AVX-NEXT: vmovaps 64(%rdi), %xmm1
5116 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
5117 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5118 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
5119 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5120 ; AVX-NEXT: vmovaps 64(%rcx), %xmm0
5121 ; AVX-NEXT: vmovaps 64(%rdx), %xmm1
5122 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
5123 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5124 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
5125 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5126 ; AVX-NEXT: vmovaps 32(%rsi), %xmm0
5127 ; AVX-NEXT: vmovaps 32(%rdi), %xmm1
5128 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
5129 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5130 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
5131 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5132 ; AVX-NEXT: vmovaps 32(%rcx), %xmm0
5133 ; AVX-NEXT: vmovaps 32(%rdx), %xmm1
5134 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
5135 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5136 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
5137 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5138 ; AVX-NEXT: vmovaps 96(%rsi), %xmm0
5139 ; AVX-NEXT: vmovaps 96(%rdi), %xmm1
5140 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
5141 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5142 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
5143 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5144 ; AVX-NEXT: vmovaps 96(%rcx), %xmm0
5145 ; AVX-NEXT: vmovaps 96(%rdx), %xmm1
5146 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
5147 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5148 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
5149 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5150 ; AVX-NEXT: vmovaps 160(%rsi), %xmm0
5151 ; AVX-NEXT: vmovaps 160(%rdi), %xmm1
5152 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
5153 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5154 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
5155 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5156 ; AVX-NEXT: vmovaps 160(%rcx), %xmm0
5157 ; AVX-NEXT: vmovaps 160(%rdx), %xmm1
5158 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
5159 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5160 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
5161 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5162 ; AVX-NEXT: vmovaps 224(%rsi), %xmm0
5163 ; AVX-NEXT: vmovaps 224(%rdi), %xmm1
5164 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
5165 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5166 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
5167 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5168 ; AVX-NEXT: vmovaps 224(%rcx), %xmm0
5169 ; AVX-NEXT: vmovaps 224(%rdx), %xmm1
5170 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
5171 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5172 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
5173 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5174 ; AVX-NEXT: vmovaps 192(%rsi), %xmm0
5175 ; AVX-NEXT: vmovaps 192(%rdi), %xmm1
5176 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
5177 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5178 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
5179 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5180 ; AVX-NEXT: vmovaps 192(%rcx), %xmm0
5181 ; AVX-NEXT: vmovaps 192(%rdx), %xmm1
5182 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
5183 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5184 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
5185 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5186 ; AVX-NEXT: vmovaps 288(%rsi), %xmm0
5187 ; AVX-NEXT: vmovaps 288(%rdi), %xmm1
5188 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
5189 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5190 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
5191 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5192 ; AVX-NEXT: vmovaps 288(%rcx), %xmm0
5193 ; AVX-NEXT: vmovaps 288(%rdx), %xmm1
5194 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
5195 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5196 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
5197 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5198 ; AVX-NEXT: vmovaps 352(%rsi), %xmm0
5199 ; AVX-NEXT: vmovaps 352(%rdi), %xmm1
5200 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
5201 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5202 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
5203 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5204 ; AVX-NEXT: vmovaps 352(%rcx), %xmm0
5205 ; AVX-NEXT: vmovaps 352(%rdx), %xmm1
5206 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
5207 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5208 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
5209 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5210 ; AVX-NEXT: vmovaps 320(%rsi), %xmm0
5211 ; AVX-NEXT: vmovaps 320(%rdi), %xmm1
5212 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
5213 ; AVX-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill
5214 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
5215 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5216 ; AVX-NEXT: vmovaps 320(%rcx), %xmm0
5217 ; AVX-NEXT: vmovaps 320(%rdx), %xmm1
5218 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
5219 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5220 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
5221 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5222 ; AVX-NEXT: vmovaps 416(%rsi), %xmm0
5223 ; AVX-NEXT: vmovaps 416(%rdi), %xmm1
5224 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
5225 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5226 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
5227 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5228 ; AVX-NEXT: vmovaps 416(%rcx), %xmm0
5229 ; AVX-NEXT: vmovaps 416(%rdx), %xmm1
5230 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
5231 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5232 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
5233 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5234 ; AVX-NEXT: vmovaps 480(%rsi), %xmm0
5235 ; AVX-NEXT: vmovaps 480(%rdi), %xmm1
5236 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
5237 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5238 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm15 = xmm1[1],xmm0[1]
5239 ; AVX-NEXT: vmovaps 480(%rcx), %xmm0
5240 ; AVX-NEXT: vmovaps 480(%rdx), %xmm1
5241 ; AVX-NEXT: vmovlhps {{.*#+}} xmm14 = xmm1[0],xmm0[0]
5242 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm1[1],xmm0[1]
5243 ; AVX-NEXT: vmovaps 448(%rsi), %xmm1
5244 ; AVX-NEXT: vmovaps 448(%rdi), %xmm0
5245 ; AVX-NEXT: vmovlhps {{.*#+}} xmm12 = xmm0[0],xmm1[0]
5246 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm0[1],xmm1[1]
5247 ; AVX-NEXT: vmovaps 448(%rcx), %xmm1
5248 ; AVX-NEXT: vmovaps 448(%rdx), %xmm0
5249 ; AVX-NEXT: vmovlhps {{.*#+}} xmm10 = xmm0[0],xmm1[0]
5250 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm0[1],xmm1[1]
5251 ; AVX-NEXT: vmovaps 384(%rsi), %xmm1
5252 ; AVX-NEXT: vmovaps 384(%rdi), %xmm0
5253 ; AVX-NEXT: vmovlhps {{.*#+}} xmm8 = xmm0[0],xmm1[0]
5254 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm0[1],xmm1[1]
5255 ; AVX-NEXT: vmovaps 384(%rcx), %xmm1
5256 ; AVX-NEXT: vmovaps 384(%rdx), %xmm0
5257 ; AVX-NEXT: vmovlhps {{.*#+}} xmm6 = xmm0[0],xmm1[0]
5258 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm0[1],xmm1[1]
5259 ; AVX-NEXT: vmovaps (%rsi), %xmm1
5260 ; AVX-NEXT: vmovaps (%rdi), %xmm0
5261 ; AVX-NEXT: vmovlhps {{.*#+}} xmm4 = xmm0[0],xmm1[0]
5262 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm0[1],xmm1[1]
5263 ; AVX-NEXT: vmovaps (%rcx), %xmm1
5264 ; AVX-NEXT: vmovaps (%rdx), %xmm0
5265 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0]
5266 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
5267 ; AVX-NEXT: vmovaps %xmm0, 48(%r8)
5268 ; AVX-NEXT: vmovaps %xmm3, 32(%r8)
5269 ; AVX-NEXT: vmovaps %xmm2, 16(%r8)
5270 ; AVX-NEXT: vmovaps %xmm4, (%r8)
5271 ; AVX-NEXT: vmovaps %xmm5, 1584(%r8)
5272 ; AVX-NEXT: vmovaps %xmm7, 1568(%r8)
5273 ; AVX-NEXT: vmovaps %xmm6, 1552(%r8)
5274 ; AVX-NEXT: vmovaps %xmm8, 1536(%r8)
5275 ; AVX-NEXT: vmovaps %xmm9, 1840(%r8)
5276 ; AVX-NEXT: vmovaps %xmm11, 1824(%r8)
5277 ; AVX-NEXT: vmovaps %xmm10, 1808(%r8)
5278 ; AVX-NEXT: vmovaps %xmm12, 1792(%r8)
5279 ; AVX-NEXT: vmovaps %xmm13, 1968(%r8)
5280 ; AVX-NEXT: vmovaps %xmm15, 1952(%r8)
5281 ; AVX-NEXT: vmovaps %xmm14, 1936(%r8)
5282 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5283 ; AVX-NEXT: vmovaps %xmm0, 1920(%r8)
5284 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5285 ; AVX-NEXT: vmovaps %xmm0, 1712(%r8)
5286 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5287 ; AVX-NEXT: vmovaps %xmm0, 1696(%r8)
5288 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5289 ; AVX-NEXT: vmovaps %xmm0, 1680(%r8)
5290 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5291 ; AVX-NEXT: vmovaps %xmm0, 1664(%r8)
5292 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5293 ; AVX-NEXT: vmovaps %xmm0, 1328(%r8)
5294 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5295 ; AVX-NEXT: vmovaps %xmm0, 1312(%r8)
5296 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5297 ; AVX-NEXT: vmovaps %xmm0, 1296(%r8)
5298 ; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
5299 ; AVX-NEXT: vmovaps %xmm0, 1280(%r8)
5300 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5301 ; AVX-NEXT: vmovaps %xmm0, 1456(%r8)
5302 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5303 ; AVX-NEXT: vmovaps %xmm0, 1440(%r8)
5304 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5305 ; AVX-NEXT: vmovaps %xmm0, 1424(%r8)
5306 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5307 ; AVX-NEXT: vmovaps %xmm0, 1408(%r8)
5308 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5309 ; AVX-NEXT: vmovaps %xmm0, 1200(%r8)
5310 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5311 ; AVX-NEXT: vmovaps %xmm0, 1184(%r8)
5312 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5313 ; AVX-NEXT: vmovaps %xmm0, 1168(%r8)
5314 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5315 ; AVX-NEXT: vmovaps %xmm0, 1152(%r8)
5316 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5317 ; AVX-NEXT: vmovaps %xmm0, 816(%r8)
5318 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5319 ; AVX-NEXT: vmovaps %xmm0, 800(%r8)
5320 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5321 ; AVX-NEXT: vmovaps %xmm0, 784(%r8)
5322 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5323 ; AVX-NEXT: vmovaps %xmm0, 768(%r8)
5324 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5325 ; AVX-NEXT: vmovaps %xmm0, 944(%r8)
5326 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5327 ; AVX-NEXT: vmovaps %xmm0, 928(%r8)
5328 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5329 ; AVX-NEXT: vmovaps %xmm0, 912(%r8)
5330 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5331 ; AVX-NEXT: vmovaps %xmm0, 896(%r8)
5332 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5333 ; AVX-NEXT: vmovaps %xmm0, 688(%r8)
5334 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5335 ; AVX-NEXT: vmovaps %xmm0, 672(%r8)
5336 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5337 ; AVX-NEXT: vmovaps %xmm0, 656(%r8)
5338 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5339 ; AVX-NEXT: vmovaps %xmm0, 640(%r8)
5340 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5341 ; AVX-NEXT: vmovaps %xmm0, 432(%r8)
5342 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5343 ; AVX-NEXT: vmovaps %xmm0, 416(%r8)
5344 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5345 ; AVX-NEXT: vmovaps %xmm0, 400(%r8)
5346 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5347 ; AVX-NEXT: vmovaps %xmm0, 384(%r8)
5348 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5349 ; AVX-NEXT: vmovaps %xmm0, 176(%r8)
5350 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5351 ; AVX-NEXT: vmovaps %xmm0, 160(%r8)
5352 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5353 ; AVX-NEXT: vmovaps %xmm0, 144(%r8)
5354 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5355 ; AVX-NEXT: vmovaps %xmm0, 128(%r8)
5356 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5357 ; AVX-NEXT: vmovaps %xmm0, 304(%r8)
5358 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5359 ; AVX-NEXT: vmovaps %xmm0, 288(%r8)
5360 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5361 ; AVX-NEXT: vmovaps %xmm0, 272(%r8)
5362 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5363 ; AVX-NEXT: vmovaps %xmm0, 256(%r8)
5364 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5365 ; AVX-NEXT: vmovaps %xmm0, 560(%r8)
5366 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5367 ; AVX-NEXT: vmovaps %xmm0, 544(%r8)
5368 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5369 ; AVX-NEXT: vmovaps %xmm0, 528(%r8)
5370 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5371 ; AVX-NEXT: vmovaps %xmm0, 512(%r8)
5372 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5373 ; AVX-NEXT: vmovaps %xmm0, 1072(%r8)
5374 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5375 ; AVX-NEXT: vmovaps %xmm0, 1056(%r8)
5376 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5377 ; AVX-NEXT: vmovaps %xmm0, 1040(%r8)
5378 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5379 ; AVX-NEXT: vmovaps %xmm0, 1024(%r8)
5380 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5381 ; AVX-NEXT: vmovaps %ymm0, 2016(%r8)
5382 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5383 ; AVX-NEXT: vmovaps %ymm0, 1984(%r8)
5384 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5385 ; AVX-NEXT: vmovaps %ymm0, 1888(%r8)
5386 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5387 ; AVX-NEXT: vmovaps %ymm0, 1856(%r8)
5388 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5389 ; AVX-NEXT: vmovaps %ymm0, 1760(%r8)
5390 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5391 ; AVX-NEXT: vmovaps %ymm0, 1728(%r8)
5392 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5393 ; AVX-NEXT: vmovaps %ymm0, 1632(%r8)
5394 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5395 ; AVX-NEXT: vmovaps %ymm0, 1600(%r8)
5396 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5397 ; AVX-NEXT: vmovaps %ymm0, 1504(%r8)
5398 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5399 ; AVX-NEXT: vmovaps %ymm0, 1472(%r8)
5400 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5401 ; AVX-NEXT: vmovaps %ymm0, 1376(%r8)
5402 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5403 ; AVX-NEXT: vmovaps %ymm0, 1344(%r8)
5404 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5405 ; AVX-NEXT: vmovaps %ymm0, 1248(%r8)
5406 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5407 ; AVX-NEXT: vmovaps %ymm0, 1216(%r8)
5408 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5409 ; AVX-NEXT: vmovaps %ymm0, 1120(%r8)
5410 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5411 ; AVX-NEXT: vmovaps %ymm0, 1088(%r8)
5412 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5413 ; AVX-NEXT: vmovaps %ymm0, 992(%r8)
5414 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5415 ; AVX-NEXT: vmovaps %ymm0, 960(%r8)
5416 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5417 ; AVX-NEXT: vmovaps %ymm0, 864(%r8)
5418 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5419 ; AVX-NEXT: vmovaps %ymm0, 832(%r8)
5420 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5421 ; AVX-NEXT: vmovaps %ymm0, 736(%r8)
5422 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5423 ; AVX-NEXT: vmovaps %ymm0, 704(%r8)
5424 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5425 ; AVX-NEXT: vmovaps %ymm0, 608(%r8)
5426 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5427 ; AVX-NEXT: vmovaps %ymm0, 576(%r8)
5428 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5429 ; AVX-NEXT: vmovaps %ymm0, 480(%r8)
5430 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5431 ; AVX-NEXT: vmovaps %ymm0, 448(%r8)
5432 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5433 ; AVX-NEXT: vmovaps %ymm0, 352(%r8)
5434 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5435 ; AVX-NEXT: vmovaps %ymm0, 320(%r8)
5436 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5437 ; AVX-NEXT: vmovaps %ymm0, 224(%r8)
5438 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5439 ; AVX-NEXT: vmovaps %ymm0, 192(%r8)
5440 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5441 ; AVX-NEXT: vmovaps %ymm0, 96(%r8)
5442 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5443 ; AVX-NEXT: vmovaps %ymm0, 64(%r8)
5444 ; AVX-NEXT: addq $1688, %rsp # imm = 0x698
5445 ; AVX-NEXT: vzeroupper
5448 ; AVX2-LABEL: store_i64_stride4_vf64:
5450 ; AVX2-NEXT: subq $1544, %rsp # imm = 0x608
5451 ; AVX2-NEXT: vmovaps (%rsi), %xmm0
5452 ; AVX2-NEXT: vmovaps 32(%rsi), %xmm1
5453 ; AVX2-NEXT: vmovaps 64(%rsi), %xmm2
5454 ; AVX2-NEXT: vinsertf128 $1, (%rcx), %ymm0, %ymm0
5455 ; AVX2-NEXT: vmovaps (%rdi), %xmm3
5456 ; AVX2-NEXT: vmovaps 32(%rdi), %xmm4
5457 ; AVX2-NEXT: vmovaps 64(%rdi), %xmm5
5458 ; AVX2-NEXT: vinsertf128 $1, (%rdx), %ymm3, %ymm3
5459 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm3[0],ymm0[0],ymm3[2],ymm0[2]
5460 ; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5461 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm0[1],ymm3[3],ymm0[3]
5462 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5463 ; AVX2-NEXT: vinsertf128 $1, 32(%rcx), %ymm1, %ymm0
5464 ; AVX2-NEXT: vinsertf128 $1, 32(%rdx), %ymm4, %ymm1
5465 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
5466 ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5467 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
5468 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5469 ; AVX2-NEXT: vinsertf128 $1, 64(%rcx), %ymm2, %ymm0
5470 ; AVX2-NEXT: vinsertf128 $1, 64(%rdx), %ymm5, %ymm1
5471 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
5472 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5473 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
5474 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5475 ; AVX2-NEXT: vmovaps 96(%rsi), %xmm0
5476 ; AVX2-NEXT: vinsertf128 $1, 96(%rcx), %ymm0, %ymm0
5477 ; AVX2-NEXT: vmovaps 96(%rdi), %xmm1
5478 ; AVX2-NEXT: vinsertf128 $1, 96(%rdx), %ymm1, %ymm1
5479 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
5480 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5481 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
5482 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5483 ; AVX2-NEXT: vmovaps 128(%rsi), %xmm0
5484 ; AVX2-NEXT: vinsertf128 $1, 128(%rcx), %ymm0, %ymm0
5485 ; AVX2-NEXT: vmovaps 128(%rdi), %xmm1
5486 ; AVX2-NEXT: vinsertf128 $1, 128(%rdx), %ymm1, %ymm1
5487 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
5488 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5489 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
5490 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5491 ; AVX2-NEXT: vmovaps 160(%rsi), %xmm0
5492 ; AVX2-NEXT: vinsertf128 $1, 160(%rcx), %ymm0, %ymm0
5493 ; AVX2-NEXT: vmovaps 160(%rdi), %xmm1
5494 ; AVX2-NEXT: vinsertf128 $1, 160(%rdx), %ymm1, %ymm1
5495 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
5496 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5497 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
5498 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5499 ; AVX2-NEXT: vmovaps 192(%rsi), %xmm0
5500 ; AVX2-NEXT: vinsertf128 $1, 192(%rcx), %ymm0, %ymm0
5501 ; AVX2-NEXT: vmovaps 192(%rdi), %xmm1
5502 ; AVX2-NEXT: vinsertf128 $1, 192(%rdx), %ymm1, %ymm1
5503 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
5504 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5505 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
5506 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5507 ; AVX2-NEXT: vmovaps 224(%rsi), %xmm0
5508 ; AVX2-NEXT: vinsertf128 $1, 224(%rcx), %ymm0, %ymm0
5509 ; AVX2-NEXT: vmovaps 224(%rdi), %xmm1
5510 ; AVX2-NEXT: vinsertf128 $1, 224(%rdx), %ymm1, %ymm1
5511 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
5512 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5513 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
5514 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5515 ; AVX2-NEXT: vmovaps 256(%rsi), %xmm0
5516 ; AVX2-NEXT: vinsertf128 $1, 256(%rcx), %ymm0, %ymm0
5517 ; AVX2-NEXT: vmovaps 256(%rdi), %xmm1
5518 ; AVX2-NEXT: vinsertf128 $1, 256(%rdx), %ymm1, %ymm1
5519 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
5520 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5521 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
5522 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5523 ; AVX2-NEXT: vmovaps 288(%rsi), %xmm0
5524 ; AVX2-NEXT: vinsertf128 $1, 288(%rcx), %ymm0, %ymm0
5525 ; AVX2-NEXT: vmovaps 288(%rdi), %xmm1
5526 ; AVX2-NEXT: vinsertf128 $1, 288(%rdx), %ymm1, %ymm1
5527 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
5528 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5529 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
5530 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5531 ; AVX2-NEXT: vmovaps 320(%rsi), %xmm0
5532 ; AVX2-NEXT: vinsertf128 $1, 320(%rcx), %ymm0, %ymm0
5533 ; AVX2-NEXT: vmovaps 320(%rdi), %xmm1
5534 ; AVX2-NEXT: vinsertf128 $1, 320(%rdx), %ymm1, %ymm1
5535 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
5536 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5537 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
5538 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5539 ; AVX2-NEXT: vmovaps 352(%rsi), %xmm0
5540 ; AVX2-NEXT: vinsertf128 $1, 352(%rcx), %ymm0, %ymm0
5541 ; AVX2-NEXT: vmovaps 352(%rdi), %xmm1
5542 ; AVX2-NEXT: vinsertf128 $1, 352(%rdx), %ymm1, %ymm1
5543 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
5544 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5545 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
5546 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5547 ; AVX2-NEXT: vmovaps 384(%rsi), %xmm0
5548 ; AVX2-NEXT: vinsertf128 $1, 384(%rcx), %ymm0, %ymm0
5549 ; AVX2-NEXT: vmovaps 384(%rdi), %xmm1
5550 ; AVX2-NEXT: vinsertf128 $1, 384(%rdx), %ymm1, %ymm1
5551 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
5552 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5553 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
5554 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5555 ; AVX2-NEXT: vmovaps 416(%rsi), %xmm0
5556 ; AVX2-NEXT: vinsertf128 $1, 416(%rcx), %ymm0, %ymm0
5557 ; AVX2-NEXT: vmovaps 416(%rdi), %xmm1
5558 ; AVX2-NEXT: vinsertf128 $1, 416(%rdx), %ymm1, %ymm1
5559 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
5560 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5561 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
5562 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5563 ; AVX2-NEXT: vmovaps 448(%rsi), %xmm0
5564 ; AVX2-NEXT: vinsertf128 $1, 448(%rcx), %ymm0, %ymm0
5565 ; AVX2-NEXT: vmovaps 448(%rdi), %xmm1
5566 ; AVX2-NEXT: vinsertf128 $1, 448(%rdx), %ymm1, %ymm1
5567 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
5568 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5569 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
5570 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5571 ; AVX2-NEXT: vmovaps 480(%rsi), %xmm0
5572 ; AVX2-NEXT: vinsertf128 $1, 480(%rcx), %ymm0, %ymm0
5573 ; AVX2-NEXT: vmovaps 480(%rdi), %xmm1
5574 ; AVX2-NEXT: vinsertf128 $1, 480(%rdx), %ymm1, %ymm1
5575 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
5576 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5577 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
5578 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5579 ; AVX2-NEXT: vmovaps (%rdi), %ymm0
5580 ; AVX2-NEXT: vmovaps (%rsi), %ymm1
5581 ; AVX2-NEXT: vmovaps (%rdx), %ymm2
5582 ; AVX2-NEXT: vmovaps (%rcx), %ymm3
5583 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
5584 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
5585 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm5[2,3],ymm4[2,3]
5586 ; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5587 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
5588 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
5589 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
5590 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5591 ; AVX2-NEXT: vmovaps 32(%rdi), %ymm0
5592 ; AVX2-NEXT: vmovaps 32(%rsi), %ymm1
5593 ; AVX2-NEXT: vmovaps 32(%rdx), %ymm2
5594 ; AVX2-NEXT: vmovaps 32(%rcx), %ymm3
5595 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
5596 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
5597 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm5[2,3],ymm4[2,3]
5598 ; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5599 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
5600 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
5601 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
5602 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5603 ; AVX2-NEXT: vmovaps 64(%rdi), %ymm0
5604 ; AVX2-NEXT: vmovaps 64(%rsi), %ymm1
5605 ; AVX2-NEXT: vmovaps 64(%rdx), %ymm2
5606 ; AVX2-NEXT: vmovaps 64(%rcx), %ymm3
5607 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
5608 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
5609 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm5[2,3],ymm4[2,3]
5610 ; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5611 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
5612 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
5613 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
5614 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5615 ; AVX2-NEXT: vmovaps 96(%rdi), %ymm0
5616 ; AVX2-NEXT: vmovaps 96(%rsi), %ymm1
5617 ; AVX2-NEXT: vmovaps 96(%rdx), %ymm2
5618 ; AVX2-NEXT: vmovaps 96(%rcx), %ymm3
5619 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
5620 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
5621 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm5[2,3],ymm4[2,3]
5622 ; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5623 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
5624 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
5625 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
5626 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5627 ; AVX2-NEXT: vmovaps 128(%rdi), %ymm0
5628 ; AVX2-NEXT: vmovaps 128(%rsi), %ymm1
5629 ; AVX2-NEXT: vmovaps 128(%rdx), %ymm2
5630 ; AVX2-NEXT: vmovaps 128(%rcx), %ymm3
5631 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
5632 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
5633 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm5[2,3],ymm4[2,3]
5634 ; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5635 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
5636 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
5637 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
5638 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5639 ; AVX2-NEXT: vmovaps 160(%rdi), %ymm0
5640 ; AVX2-NEXT: vmovaps 160(%rsi), %ymm1
5641 ; AVX2-NEXT: vmovaps 160(%rdx), %ymm2
5642 ; AVX2-NEXT: vmovaps 160(%rcx), %ymm3
5643 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
5644 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
5645 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm5[2,3],ymm4[2,3]
5646 ; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5647 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
5648 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
5649 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
5650 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5651 ; AVX2-NEXT: vmovaps 192(%rdi), %ymm0
5652 ; AVX2-NEXT: vmovaps 192(%rsi), %ymm1
5653 ; AVX2-NEXT: vmovaps 192(%rdx), %ymm2
5654 ; AVX2-NEXT: vmovaps 192(%rcx), %ymm3
5655 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
5656 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
5657 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm5[2,3],ymm4[2,3]
5658 ; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5659 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
5660 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
5661 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
5662 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5663 ; AVX2-NEXT: vmovaps 224(%rdi), %ymm0
5664 ; AVX2-NEXT: vmovaps 224(%rsi), %ymm1
5665 ; AVX2-NEXT: vmovaps 224(%rdx), %ymm2
5666 ; AVX2-NEXT: vmovaps 224(%rcx), %ymm3
5667 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
5668 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
5669 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm5[2,3],ymm4[2,3]
5670 ; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5671 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
5672 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
5673 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
5674 ; AVX2-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
5675 ; AVX2-NEXT: vmovaps 256(%rdi), %ymm0
5676 ; AVX2-NEXT: vmovaps 256(%rsi), %ymm1
5677 ; AVX2-NEXT: vmovaps 256(%rdx), %ymm2
5678 ; AVX2-NEXT: vmovaps 256(%rcx), %ymm3
5679 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
5680 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
5681 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm5[2,3],ymm4[2,3]
5682 ; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5683 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
5684 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
5685 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
5686 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5687 ; AVX2-NEXT: vmovaps 288(%rdi), %ymm0
5688 ; AVX2-NEXT: vmovaps 288(%rsi), %ymm1
5689 ; AVX2-NEXT: vmovaps 288(%rdx), %ymm2
5690 ; AVX2-NEXT: vmovaps 288(%rcx), %ymm3
5691 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
5692 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
5693 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm5[2,3],ymm4[2,3]
5694 ; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5695 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
5696 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
5697 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
5698 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5699 ; AVX2-NEXT: vmovaps 320(%rdi), %ymm0
5700 ; AVX2-NEXT: vmovaps 320(%rsi), %ymm1
5701 ; AVX2-NEXT: vmovaps 320(%rdx), %ymm2
5702 ; AVX2-NEXT: vmovaps 320(%rcx), %ymm3
5703 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
5704 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
5705 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm5[2,3],ymm4[2,3]
5706 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
5707 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
5708 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm0[2,3],ymm2[2,3]
5709 ; AVX2-NEXT: vmovaps 352(%rdi), %ymm3
5710 ; AVX2-NEXT: vmovaps 352(%rsi), %ymm1
5711 ; AVX2-NEXT: vmovaps 352(%rdx), %ymm2
5712 ; AVX2-NEXT: vmovaps 352(%rcx), %ymm0
5713 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm0[0],ymm2[2],ymm0[2]
5714 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
5715 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm15[2,3],ymm4[2,3]
5716 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3]
5717 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
5718 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm1[2,3],ymm0[2,3]
5719 ; AVX2-NEXT: vmovaps 384(%rdi), %ymm1
5720 ; AVX2-NEXT: vmovaps 384(%rsi), %ymm3
5721 ; AVX2-NEXT: vmovaps 384(%rdx), %ymm15
5722 ; AVX2-NEXT: vmovaps 384(%rcx), %ymm0
5723 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm15[0],ymm0[0],ymm15[2],ymm0[2]
5724 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm1[0],ymm3[0],ymm1[2],ymm3[2]
5725 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm14[2,3],ymm2[2,3]
5726 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm15[1],ymm0[1],ymm15[3],ymm0[3]
5727 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3]
5728 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm1[2,3],ymm0[2,3]
5729 ; AVX2-NEXT: vmovaps 416(%rdi), %ymm1
5730 ; AVX2-NEXT: vmovaps 416(%rsi), %ymm3
5731 ; AVX2-NEXT: vmovaps 416(%rdx), %ymm14
5732 ; AVX2-NEXT: vmovaps 416(%rcx), %ymm15
5733 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm14[0],ymm15[0],ymm14[2],ymm15[2]
5734 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm1[0],ymm3[0],ymm1[2],ymm3[2]
5735 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm13[2,3],ymm0[2,3]
5736 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm14[1],ymm15[1],ymm14[3],ymm15[3]
5737 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3]
5738 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],ymm13[2,3]
5739 ; AVX2-NEXT: vmovaps 448(%rdi), %ymm3
5740 ; AVX2-NEXT: vmovaps 448(%rsi), %ymm13
5741 ; AVX2-NEXT: vmovaps 448(%rdx), %ymm14
5742 ; AVX2-NEXT: vmovaps 448(%rcx), %ymm15
5743 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm14[0],ymm15[0],ymm14[2],ymm15[2]
5744 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm3[0],ymm13[0],ymm3[2],ymm13[2]
5745 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm12[2,3],ymm0[2,3]
5746 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm14[1],ymm15[1],ymm14[3],ymm15[3]
5747 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm13[1],ymm3[3],ymm13[3]
5748 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm12[2,3]
5749 ; AVX2-NEXT: vmovaps 480(%rdi), %ymm12
5750 ; AVX2-NEXT: vmovaps 480(%rsi), %ymm13
5751 ; AVX2-NEXT: vmovaps 480(%rdx), %ymm14
5752 ; AVX2-NEXT: vmovaps 480(%rcx), %ymm15
5753 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm14[0],ymm15[0],ymm14[2],ymm15[2]
5754 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm12[0],ymm13[0],ymm12[2],ymm13[2]
5755 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3]
5756 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm14[1],ymm15[1],ymm14[3],ymm15[3]
5757 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm12[1],ymm13[1],ymm12[3],ymm13[3]
5758 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3]
5759 ; AVX2-NEXT: vmovaps %ymm11, 2016(%r8)
5760 ; AVX2-NEXT: vmovaps %ymm0, 1984(%r8)
5761 ; AVX2-NEXT: vmovaps %ymm3, 1888(%r8)
5762 ; AVX2-NEXT: vmovaps %ymm1, 1856(%r8)
5763 ; AVX2-NEXT: vmovaps %ymm2, 1760(%r8)
5764 ; AVX2-NEXT: vmovaps %ymm4, 1728(%r8)
5765 ; AVX2-NEXT: vmovaps %ymm5, 1632(%r8)
5766 ; AVX2-NEXT: vmovaps %ymm6, 1600(%r8)
5767 ; AVX2-NEXT: vmovaps %ymm7, 1504(%r8)
5768 ; AVX2-NEXT: vmovaps %ymm8, 1472(%r8)
5769 ; AVX2-NEXT: vmovaps %ymm9, 1376(%r8)
5770 ; AVX2-NEXT: vmovaps %ymm10, 1344(%r8)
5771 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5772 ; AVX2-NEXT: vmovaps %ymm0, 1248(%r8)
5773 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5774 ; AVX2-NEXT: vmovaps %ymm0, 1216(%r8)
5775 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5776 ; AVX2-NEXT: vmovaps %ymm0, 1120(%r8)
5777 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5778 ; AVX2-NEXT: vmovaps %ymm0, 1088(%r8)
5779 ; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
5780 ; AVX2-NEXT: vmovaps %ymm0, 992(%r8)
5781 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5782 ; AVX2-NEXT: vmovaps %ymm0, 960(%r8)
5783 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5784 ; AVX2-NEXT: vmovaps %ymm0, 864(%r8)
5785 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5786 ; AVX2-NEXT: vmovaps %ymm0, 832(%r8)
5787 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5788 ; AVX2-NEXT: vmovaps %ymm0, 736(%r8)
5789 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5790 ; AVX2-NEXT: vmovaps %ymm0, 704(%r8)
5791 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5792 ; AVX2-NEXT: vmovaps %ymm0, 608(%r8)
5793 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5794 ; AVX2-NEXT: vmovaps %ymm0, 576(%r8)
5795 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5796 ; AVX2-NEXT: vmovaps %ymm0, 480(%r8)
5797 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5798 ; AVX2-NEXT: vmovaps %ymm0, 448(%r8)
5799 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5800 ; AVX2-NEXT: vmovaps %ymm0, 352(%r8)
5801 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5802 ; AVX2-NEXT: vmovaps %ymm0, 320(%r8)
5803 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5804 ; AVX2-NEXT: vmovaps %ymm0, 224(%r8)
5805 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5806 ; AVX2-NEXT: vmovaps %ymm0, 192(%r8)
5807 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5808 ; AVX2-NEXT: vmovaps %ymm0, 96(%r8)
5809 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5810 ; AVX2-NEXT: vmovaps %ymm0, 64(%r8)
5811 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5812 ; AVX2-NEXT: vmovaps %ymm0, 1952(%r8)
5813 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5814 ; AVX2-NEXT: vmovaps %ymm0, 1920(%r8)
5815 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5816 ; AVX2-NEXT: vmovaps %ymm0, 1824(%r8)
5817 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5818 ; AVX2-NEXT: vmovaps %ymm0, 1792(%r8)
5819 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5820 ; AVX2-NEXT: vmovaps %ymm0, 1696(%r8)
5821 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5822 ; AVX2-NEXT: vmovaps %ymm0, 1664(%r8)
5823 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5824 ; AVX2-NEXT: vmovaps %ymm0, 1568(%r8)
5825 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5826 ; AVX2-NEXT: vmovaps %ymm0, 1536(%r8)
5827 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5828 ; AVX2-NEXT: vmovaps %ymm0, 1440(%r8)
5829 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5830 ; AVX2-NEXT: vmovaps %ymm0, 1408(%r8)
5831 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5832 ; AVX2-NEXT: vmovaps %ymm0, 1312(%r8)
5833 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5834 ; AVX2-NEXT: vmovaps %ymm0, 1280(%r8)
5835 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5836 ; AVX2-NEXT: vmovaps %ymm0, 1184(%r8)
5837 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5838 ; AVX2-NEXT: vmovaps %ymm0, 1152(%r8)
5839 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5840 ; AVX2-NEXT: vmovaps %ymm0, 1056(%r8)
5841 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5842 ; AVX2-NEXT: vmovaps %ymm0, 1024(%r8)
5843 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5844 ; AVX2-NEXT: vmovaps %ymm0, 928(%r8)
5845 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5846 ; AVX2-NEXT: vmovaps %ymm0, 896(%r8)
5847 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5848 ; AVX2-NEXT: vmovaps %ymm0, 800(%r8)
5849 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5850 ; AVX2-NEXT: vmovaps %ymm0, 768(%r8)
5851 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5852 ; AVX2-NEXT: vmovaps %ymm0, 672(%r8)
5853 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5854 ; AVX2-NEXT: vmovaps %ymm0, 640(%r8)
5855 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5856 ; AVX2-NEXT: vmovaps %ymm0, 544(%r8)
5857 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5858 ; AVX2-NEXT: vmovaps %ymm0, 512(%r8)
5859 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5860 ; AVX2-NEXT: vmovaps %ymm0, 416(%r8)
5861 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5862 ; AVX2-NEXT: vmovaps %ymm0, 384(%r8)
5863 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5864 ; AVX2-NEXT: vmovaps %ymm0, 288(%r8)
5865 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5866 ; AVX2-NEXT: vmovaps %ymm0, 256(%r8)
5867 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5868 ; AVX2-NEXT: vmovaps %ymm0, 160(%r8)
5869 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5870 ; AVX2-NEXT: vmovaps %ymm0, 128(%r8)
5871 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5872 ; AVX2-NEXT: vmovaps %ymm0, 32(%r8)
5873 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5874 ; AVX2-NEXT: vmovaps %ymm0, (%r8)
5875 ; AVX2-NEXT: addq $1544, %rsp # imm = 0x608
5876 ; AVX2-NEXT: vzeroupper
5879 ; AVX2-FP-LABEL: store_i64_stride4_vf64:
5881 ; AVX2-FP-NEXT: subq $1544, %rsp # imm = 0x608
5882 ; AVX2-FP-NEXT: vmovaps (%rsi), %xmm0
5883 ; AVX2-FP-NEXT: vmovaps 32(%rsi), %xmm1
5884 ; AVX2-FP-NEXT: vmovaps 64(%rsi), %xmm2
5885 ; AVX2-FP-NEXT: vinsertf128 $1, (%rcx), %ymm0, %ymm0
5886 ; AVX2-FP-NEXT: vmovaps (%rdi), %xmm3
5887 ; AVX2-FP-NEXT: vmovaps 32(%rdi), %xmm4
5888 ; AVX2-FP-NEXT: vmovaps 64(%rdi), %xmm5
5889 ; AVX2-FP-NEXT: vinsertf128 $1, (%rdx), %ymm3, %ymm3
5890 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm3[0],ymm0[0],ymm3[2],ymm0[2]
5891 ; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5892 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm0[1],ymm3[3],ymm0[3]
5893 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5894 ; AVX2-FP-NEXT: vinsertf128 $1, 32(%rcx), %ymm1, %ymm0
5895 ; AVX2-FP-NEXT: vinsertf128 $1, 32(%rdx), %ymm4, %ymm1
5896 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
5897 ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5898 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
5899 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5900 ; AVX2-FP-NEXT: vinsertf128 $1, 64(%rcx), %ymm2, %ymm0
5901 ; AVX2-FP-NEXT: vinsertf128 $1, 64(%rdx), %ymm5, %ymm1
5902 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
5903 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5904 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
5905 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5906 ; AVX2-FP-NEXT: vmovaps 96(%rsi), %xmm0
5907 ; AVX2-FP-NEXT: vinsertf128 $1, 96(%rcx), %ymm0, %ymm0
5908 ; AVX2-FP-NEXT: vmovaps 96(%rdi), %xmm1
5909 ; AVX2-FP-NEXT: vinsertf128 $1, 96(%rdx), %ymm1, %ymm1
5910 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
5911 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5912 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
5913 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5914 ; AVX2-FP-NEXT: vmovaps 128(%rsi), %xmm0
5915 ; AVX2-FP-NEXT: vinsertf128 $1, 128(%rcx), %ymm0, %ymm0
5916 ; AVX2-FP-NEXT: vmovaps 128(%rdi), %xmm1
5917 ; AVX2-FP-NEXT: vinsertf128 $1, 128(%rdx), %ymm1, %ymm1
5918 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
5919 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5920 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
5921 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5922 ; AVX2-FP-NEXT: vmovaps 160(%rsi), %xmm0
5923 ; AVX2-FP-NEXT: vinsertf128 $1, 160(%rcx), %ymm0, %ymm0
5924 ; AVX2-FP-NEXT: vmovaps 160(%rdi), %xmm1
5925 ; AVX2-FP-NEXT: vinsertf128 $1, 160(%rdx), %ymm1, %ymm1
5926 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
5927 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5928 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
5929 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5930 ; AVX2-FP-NEXT: vmovaps 192(%rsi), %xmm0
5931 ; AVX2-FP-NEXT: vinsertf128 $1, 192(%rcx), %ymm0, %ymm0
5932 ; AVX2-FP-NEXT: vmovaps 192(%rdi), %xmm1
5933 ; AVX2-FP-NEXT: vinsertf128 $1, 192(%rdx), %ymm1, %ymm1
5934 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
5935 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5936 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
5937 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5938 ; AVX2-FP-NEXT: vmovaps 224(%rsi), %xmm0
5939 ; AVX2-FP-NEXT: vinsertf128 $1, 224(%rcx), %ymm0, %ymm0
5940 ; AVX2-FP-NEXT: vmovaps 224(%rdi), %xmm1
5941 ; AVX2-FP-NEXT: vinsertf128 $1, 224(%rdx), %ymm1, %ymm1
5942 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
5943 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5944 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
5945 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5946 ; AVX2-FP-NEXT: vmovaps 256(%rsi), %xmm0
5947 ; AVX2-FP-NEXT: vinsertf128 $1, 256(%rcx), %ymm0, %ymm0
5948 ; AVX2-FP-NEXT: vmovaps 256(%rdi), %xmm1
5949 ; AVX2-FP-NEXT: vinsertf128 $1, 256(%rdx), %ymm1, %ymm1
5950 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
5951 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5952 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
5953 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5954 ; AVX2-FP-NEXT: vmovaps 288(%rsi), %xmm0
5955 ; AVX2-FP-NEXT: vinsertf128 $1, 288(%rcx), %ymm0, %ymm0
5956 ; AVX2-FP-NEXT: vmovaps 288(%rdi), %xmm1
5957 ; AVX2-FP-NEXT: vinsertf128 $1, 288(%rdx), %ymm1, %ymm1
5958 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
5959 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5960 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
5961 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5962 ; AVX2-FP-NEXT: vmovaps 320(%rsi), %xmm0
5963 ; AVX2-FP-NEXT: vinsertf128 $1, 320(%rcx), %ymm0, %ymm0
5964 ; AVX2-FP-NEXT: vmovaps 320(%rdi), %xmm1
5965 ; AVX2-FP-NEXT: vinsertf128 $1, 320(%rdx), %ymm1, %ymm1
5966 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
5967 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5968 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
5969 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5970 ; AVX2-FP-NEXT: vmovaps 352(%rsi), %xmm0
5971 ; AVX2-FP-NEXT: vinsertf128 $1, 352(%rcx), %ymm0, %ymm0
5972 ; AVX2-FP-NEXT: vmovaps 352(%rdi), %xmm1
5973 ; AVX2-FP-NEXT: vinsertf128 $1, 352(%rdx), %ymm1, %ymm1
5974 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
5975 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5976 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
5977 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5978 ; AVX2-FP-NEXT: vmovaps 384(%rsi), %xmm0
5979 ; AVX2-FP-NEXT: vinsertf128 $1, 384(%rcx), %ymm0, %ymm0
5980 ; AVX2-FP-NEXT: vmovaps 384(%rdi), %xmm1
5981 ; AVX2-FP-NEXT: vinsertf128 $1, 384(%rdx), %ymm1, %ymm1
5982 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
5983 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5984 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
5985 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5986 ; AVX2-FP-NEXT: vmovaps 416(%rsi), %xmm0
5987 ; AVX2-FP-NEXT: vinsertf128 $1, 416(%rcx), %ymm0, %ymm0
5988 ; AVX2-FP-NEXT: vmovaps 416(%rdi), %xmm1
5989 ; AVX2-FP-NEXT: vinsertf128 $1, 416(%rdx), %ymm1, %ymm1
5990 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
5991 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5992 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
5993 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5994 ; AVX2-FP-NEXT: vmovaps 448(%rsi), %xmm0
5995 ; AVX2-FP-NEXT: vinsertf128 $1, 448(%rcx), %ymm0, %ymm0
5996 ; AVX2-FP-NEXT: vmovaps 448(%rdi), %xmm1
5997 ; AVX2-FP-NEXT: vinsertf128 $1, 448(%rdx), %ymm1, %ymm1
5998 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
5999 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6000 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
6001 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6002 ; AVX2-FP-NEXT: vmovaps 480(%rsi), %xmm0
6003 ; AVX2-FP-NEXT: vinsertf128 $1, 480(%rcx), %ymm0, %ymm0
6004 ; AVX2-FP-NEXT: vmovaps 480(%rdi), %xmm1
6005 ; AVX2-FP-NEXT: vinsertf128 $1, 480(%rdx), %ymm1, %ymm1
6006 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
6007 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6008 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
6009 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6010 ; AVX2-FP-NEXT: vmovaps (%rdi), %ymm0
6011 ; AVX2-FP-NEXT: vmovaps (%rsi), %ymm1
6012 ; AVX2-FP-NEXT: vmovaps (%rdx), %ymm2
6013 ; AVX2-FP-NEXT: vmovaps (%rcx), %ymm3
6014 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
6015 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
6016 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm5[2,3],ymm4[2,3]
6017 ; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6018 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
6019 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
6020 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
6021 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6022 ; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm0
6023 ; AVX2-FP-NEXT: vmovaps 32(%rsi), %ymm1
6024 ; AVX2-FP-NEXT: vmovaps 32(%rdx), %ymm2
6025 ; AVX2-FP-NEXT: vmovaps 32(%rcx), %ymm3
6026 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
6027 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
6028 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm5[2,3],ymm4[2,3]
6029 ; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6030 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
6031 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
6032 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
6033 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6034 ; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm0
6035 ; AVX2-FP-NEXT: vmovaps 64(%rsi), %ymm1
6036 ; AVX2-FP-NEXT: vmovaps 64(%rdx), %ymm2
6037 ; AVX2-FP-NEXT: vmovaps 64(%rcx), %ymm3
6038 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
6039 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
6040 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm5[2,3],ymm4[2,3]
6041 ; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6042 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
6043 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
6044 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
6045 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6046 ; AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm0
6047 ; AVX2-FP-NEXT: vmovaps 96(%rsi), %ymm1
6048 ; AVX2-FP-NEXT: vmovaps 96(%rdx), %ymm2
6049 ; AVX2-FP-NEXT: vmovaps 96(%rcx), %ymm3
6050 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
6051 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
6052 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm5[2,3],ymm4[2,3]
6053 ; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6054 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
6055 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
6056 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
6057 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6058 ; AVX2-FP-NEXT: vmovaps 128(%rdi), %ymm0
6059 ; AVX2-FP-NEXT: vmovaps 128(%rsi), %ymm1
6060 ; AVX2-FP-NEXT: vmovaps 128(%rdx), %ymm2
6061 ; AVX2-FP-NEXT: vmovaps 128(%rcx), %ymm3
6062 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
6063 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
6064 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm5[2,3],ymm4[2,3]
6065 ; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6066 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
6067 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
6068 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
6069 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6070 ; AVX2-FP-NEXT: vmovaps 160(%rdi), %ymm0
6071 ; AVX2-FP-NEXT: vmovaps 160(%rsi), %ymm1
6072 ; AVX2-FP-NEXT: vmovaps 160(%rdx), %ymm2
6073 ; AVX2-FP-NEXT: vmovaps 160(%rcx), %ymm3
6074 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
6075 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
6076 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm5[2,3],ymm4[2,3]
6077 ; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6078 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
6079 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
6080 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
6081 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6082 ; AVX2-FP-NEXT: vmovaps 192(%rdi), %ymm0
6083 ; AVX2-FP-NEXT: vmovaps 192(%rsi), %ymm1
6084 ; AVX2-FP-NEXT: vmovaps 192(%rdx), %ymm2
6085 ; AVX2-FP-NEXT: vmovaps 192(%rcx), %ymm3
6086 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
6087 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
6088 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm5[2,3],ymm4[2,3]
6089 ; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6090 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
6091 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
6092 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
6093 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6094 ; AVX2-FP-NEXT: vmovaps 224(%rdi), %ymm0
6095 ; AVX2-FP-NEXT: vmovaps 224(%rsi), %ymm1
6096 ; AVX2-FP-NEXT: vmovaps 224(%rdx), %ymm2
6097 ; AVX2-FP-NEXT: vmovaps 224(%rcx), %ymm3
6098 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
6099 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
6100 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm5[2,3],ymm4[2,3]
6101 ; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6102 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
6103 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
6104 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
6105 ; AVX2-FP-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
6106 ; AVX2-FP-NEXT: vmovaps 256(%rdi), %ymm0
6107 ; AVX2-FP-NEXT: vmovaps 256(%rsi), %ymm1
6108 ; AVX2-FP-NEXT: vmovaps 256(%rdx), %ymm2
6109 ; AVX2-FP-NEXT: vmovaps 256(%rcx), %ymm3
6110 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
6111 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
6112 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm5[2,3],ymm4[2,3]
6113 ; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6114 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
6115 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
6116 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
6117 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6118 ; AVX2-FP-NEXT: vmovaps 288(%rdi), %ymm0
6119 ; AVX2-FP-NEXT: vmovaps 288(%rsi), %ymm1
6120 ; AVX2-FP-NEXT: vmovaps 288(%rdx), %ymm2
6121 ; AVX2-FP-NEXT: vmovaps 288(%rcx), %ymm3
6122 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
6123 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
6124 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm5[2,3],ymm4[2,3]
6125 ; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6126 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
6127 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
6128 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
6129 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6130 ; AVX2-FP-NEXT: vmovaps 320(%rdi), %ymm0
6131 ; AVX2-FP-NEXT: vmovaps 320(%rsi), %ymm1
6132 ; AVX2-FP-NEXT: vmovaps 320(%rdx), %ymm2
6133 ; AVX2-FP-NEXT: vmovaps 320(%rcx), %ymm3
6134 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
6135 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
6136 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm5[2,3],ymm4[2,3]
6137 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
6138 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
6139 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm0[2,3],ymm2[2,3]
6140 ; AVX2-FP-NEXT: vmovaps 352(%rdi), %ymm3
6141 ; AVX2-FP-NEXT: vmovaps 352(%rsi), %ymm1
6142 ; AVX2-FP-NEXT: vmovaps 352(%rdx), %ymm2
6143 ; AVX2-FP-NEXT: vmovaps 352(%rcx), %ymm0
6144 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm0[0],ymm2[2],ymm0[2]
6145 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
6146 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm15[2,3],ymm4[2,3]
6147 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3]
6148 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
6149 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm1[2,3],ymm0[2,3]
6150 ; AVX2-FP-NEXT: vmovaps 384(%rdi), %ymm1
6151 ; AVX2-FP-NEXT: vmovaps 384(%rsi), %ymm3
6152 ; AVX2-FP-NEXT: vmovaps 384(%rdx), %ymm15
6153 ; AVX2-FP-NEXT: vmovaps 384(%rcx), %ymm0
6154 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm15[0],ymm0[0],ymm15[2],ymm0[2]
6155 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm1[0],ymm3[0],ymm1[2],ymm3[2]
6156 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm14[2,3],ymm2[2,3]
6157 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm15[1],ymm0[1],ymm15[3],ymm0[3]
6158 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3]
6159 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm1[2,3],ymm0[2,3]
6160 ; AVX2-FP-NEXT: vmovaps 416(%rdi), %ymm1
6161 ; AVX2-FP-NEXT: vmovaps 416(%rsi), %ymm3
6162 ; AVX2-FP-NEXT: vmovaps 416(%rdx), %ymm14
6163 ; AVX2-FP-NEXT: vmovaps 416(%rcx), %ymm15
6164 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm14[0],ymm15[0],ymm14[2],ymm15[2]
6165 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm1[0],ymm3[0],ymm1[2],ymm3[2]
6166 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm13[2,3],ymm0[2,3]
6167 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm14[1],ymm15[1],ymm14[3],ymm15[3]
6168 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3]
6169 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],ymm13[2,3]
6170 ; AVX2-FP-NEXT: vmovaps 448(%rdi), %ymm3
6171 ; AVX2-FP-NEXT: vmovaps 448(%rsi), %ymm13
6172 ; AVX2-FP-NEXT: vmovaps 448(%rdx), %ymm14
6173 ; AVX2-FP-NEXT: vmovaps 448(%rcx), %ymm15
6174 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm14[0],ymm15[0],ymm14[2],ymm15[2]
6175 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm3[0],ymm13[0],ymm3[2],ymm13[2]
6176 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm12[2,3],ymm0[2,3]
6177 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm14[1],ymm15[1],ymm14[3],ymm15[3]
6178 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm13[1],ymm3[3],ymm13[3]
6179 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm12[2,3]
6180 ; AVX2-FP-NEXT: vmovaps 480(%rdi), %ymm12
6181 ; AVX2-FP-NEXT: vmovaps 480(%rsi), %ymm13
6182 ; AVX2-FP-NEXT: vmovaps 480(%rdx), %ymm14
6183 ; AVX2-FP-NEXT: vmovaps 480(%rcx), %ymm15
6184 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm14[0],ymm15[0],ymm14[2],ymm15[2]
6185 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm12[0],ymm13[0],ymm12[2],ymm13[2]
6186 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3]
6187 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm14[1],ymm15[1],ymm14[3],ymm15[3]
6188 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm12[1],ymm13[1],ymm12[3],ymm13[3]
6189 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3]
6190 ; AVX2-FP-NEXT: vmovaps %ymm11, 2016(%r8)
6191 ; AVX2-FP-NEXT: vmovaps %ymm0, 1984(%r8)
6192 ; AVX2-FP-NEXT: vmovaps %ymm3, 1888(%r8)
6193 ; AVX2-FP-NEXT: vmovaps %ymm1, 1856(%r8)
6194 ; AVX2-FP-NEXT: vmovaps %ymm2, 1760(%r8)
6195 ; AVX2-FP-NEXT: vmovaps %ymm4, 1728(%r8)
6196 ; AVX2-FP-NEXT: vmovaps %ymm5, 1632(%r8)
6197 ; AVX2-FP-NEXT: vmovaps %ymm6, 1600(%r8)
6198 ; AVX2-FP-NEXT: vmovaps %ymm7, 1504(%r8)
6199 ; AVX2-FP-NEXT: vmovaps %ymm8, 1472(%r8)
6200 ; AVX2-FP-NEXT: vmovaps %ymm9, 1376(%r8)
6201 ; AVX2-FP-NEXT: vmovaps %ymm10, 1344(%r8)
6202 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6203 ; AVX2-FP-NEXT: vmovaps %ymm0, 1248(%r8)
6204 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6205 ; AVX2-FP-NEXT: vmovaps %ymm0, 1216(%r8)
6206 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6207 ; AVX2-FP-NEXT: vmovaps %ymm0, 1120(%r8)
6208 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6209 ; AVX2-FP-NEXT: vmovaps %ymm0, 1088(%r8)
6210 ; AVX2-FP-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
6211 ; AVX2-FP-NEXT: vmovaps %ymm0, 992(%r8)
6212 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6213 ; AVX2-FP-NEXT: vmovaps %ymm0, 960(%r8)
6214 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6215 ; AVX2-FP-NEXT: vmovaps %ymm0, 864(%r8)
6216 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6217 ; AVX2-FP-NEXT: vmovaps %ymm0, 832(%r8)
6218 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6219 ; AVX2-FP-NEXT: vmovaps %ymm0, 736(%r8)
6220 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6221 ; AVX2-FP-NEXT: vmovaps %ymm0, 704(%r8)
6222 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6223 ; AVX2-FP-NEXT: vmovaps %ymm0, 608(%r8)
6224 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6225 ; AVX2-FP-NEXT: vmovaps %ymm0, 576(%r8)
6226 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6227 ; AVX2-FP-NEXT: vmovaps %ymm0, 480(%r8)
6228 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6229 ; AVX2-FP-NEXT: vmovaps %ymm0, 448(%r8)
6230 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6231 ; AVX2-FP-NEXT: vmovaps %ymm0, 352(%r8)
6232 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6233 ; AVX2-FP-NEXT: vmovaps %ymm0, 320(%r8)
6234 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6235 ; AVX2-FP-NEXT: vmovaps %ymm0, 224(%r8)
6236 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6237 ; AVX2-FP-NEXT: vmovaps %ymm0, 192(%r8)
6238 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6239 ; AVX2-FP-NEXT: vmovaps %ymm0, 96(%r8)
6240 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6241 ; AVX2-FP-NEXT: vmovaps %ymm0, 64(%r8)
6242 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6243 ; AVX2-FP-NEXT: vmovaps %ymm0, 1952(%r8)
6244 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6245 ; AVX2-FP-NEXT: vmovaps %ymm0, 1920(%r8)
6246 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6247 ; AVX2-FP-NEXT: vmovaps %ymm0, 1824(%r8)
6248 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6249 ; AVX2-FP-NEXT: vmovaps %ymm0, 1792(%r8)
6250 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6251 ; AVX2-FP-NEXT: vmovaps %ymm0, 1696(%r8)
6252 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6253 ; AVX2-FP-NEXT: vmovaps %ymm0, 1664(%r8)
6254 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6255 ; AVX2-FP-NEXT: vmovaps %ymm0, 1568(%r8)
6256 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6257 ; AVX2-FP-NEXT: vmovaps %ymm0, 1536(%r8)
6258 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6259 ; AVX2-FP-NEXT: vmovaps %ymm0, 1440(%r8)
6260 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6261 ; AVX2-FP-NEXT: vmovaps %ymm0, 1408(%r8)
6262 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6263 ; AVX2-FP-NEXT: vmovaps %ymm0, 1312(%r8)
6264 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6265 ; AVX2-FP-NEXT: vmovaps %ymm0, 1280(%r8)
6266 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6267 ; AVX2-FP-NEXT: vmovaps %ymm0, 1184(%r8)
6268 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6269 ; AVX2-FP-NEXT: vmovaps %ymm0, 1152(%r8)
6270 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6271 ; AVX2-FP-NEXT: vmovaps %ymm0, 1056(%r8)
6272 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6273 ; AVX2-FP-NEXT: vmovaps %ymm0, 1024(%r8)
6274 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6275 ; AVX2-FP-NEXT: vmovaps %ymm0, 928(%r8)
6276 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6277 ; AVX2-FP-NEXT: vmovaps %ymm0, 896(%r8)
6278 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6279 ; AVX2-FP-NEXT: vmovaps %ymm0, 800(%r8)
6280 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6281 ; AVX2-FP-NEXT: vmovaps %ymm0, 768(%r8)
6282 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6283 ; AVX2-FP-NEXT: vmovaps %ymm0, 672(%r8)
6284 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6285 ; AVX2-FP-NEXT: vmovaps %ymm0, 640(%r8)
6286 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6287 ; AVX2-FP-NEXT: vmovaps %ymm0, 544(%r8)
6288 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6289 ; AVX2-FP-NEXT: vmovaps %ymm0, 512(%r8)
6290 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6291 ; AVX2-FP-NEXT: vmovaps %ymm0, 416(%r8)
6292 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6293 ; AVX2-FP-NEXT: vmovaps %ymm0, 384(%r8)
6294 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6295 ; AVX2-FP-NEXT: vmovaps %ymm0, 288(%r8)
6296 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6297 ; AVX2-FP-NEXT: vmovaps %ymm0, 256(%r8)
6298 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6299 ; AVX2-FP-NEXT: vmovaps %ymm0, 160(%r8)
6300 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6301 ; AVX2-FP-NEXT: vmovaps %ymm0, 128(%r8)
6302 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6303 ; AVX2-FP-NEXT: vmovaps %ymm0, 32(%r8)
6304 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6305 ; AVX2-FP-NEXT: vmovaps %ymm0, (%r8)
6306 ; AVX2-FP-NEXT: addq $1544, %rsp # imm = 0x608
6307 ; AVX2-FP-NEXT: vzeroupper
6308 ; AVX2-FP-NEXT: retq
6310 ; AVX2-FCP-LABEL: store_i64_stride4_vf64:
6311 ; AVX2-FCP: # %bb.0:
6312 ; AVX2-FCP-NEXT: subq $1544, %rsp # imm = 0x608
6313 ; AVX2-FCP-NEXT: vmovaps (%rsi), %xmm0
6314 ; AVX2-FCP-NEXT: vmovaps 32(%rsi), %xmm1
6315 ; AVX2-FCP-NEXT: vmovaps 64(%rsi), %xmm2
6316 ; AVX2-FCP-NEXT: vinsertf128 $1, (%rcx), %ymm0, %ymm0
6317 ; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm3
6318 ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %xmm4
6319 ; AVX2-FCP-NEXT: vmovaps 64(%rdi), %xmm5
6320 ; AVX2-FCP-NEXT: vinsertf128 $1, (%rdx), %ymm3, %ymm3
6321 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm3[0],ymm0[0],ymm3[2],ymm0[2]
6322 ; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6323 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm0[1],ymm3[3],ymm0[3]
6324 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6325 ; AVX2-FCP-NEXT: vinsertf128 $1, 32(%rcx), %ymm1, %ymm0
6326 ; AVX2-FCP-NEXT: vinsertf128 $1, 32(%rdx), %ymm4, %ymm1
6327 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
6328 ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6329 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
6330 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6331 ; AVX2-FCP-NEXT: vinsertf128 $1, 64(%rcx), %ymm2, %ymm0
6332 ; AVX2-FCP-NEXT: vinsertf128 $1, 64(%rdx), %ymm5, %ymm1
6333 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
6334 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6335 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
6336 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6337 ; AVX2-FCP-NEXT: vmovaps 96(%rsi), %xmm0
6338 ; AVX2-FCP-NEXT: vinsertf128 $1, 96(%rcx), %ymm0, %ymm0
6339 ; AVX2-FCP-NEXT: vmovaps 96(%rdi), %xmm1
6340 ; AVX2-FCP-NEXT: vinsertf128 $1, 96(%rdx), %ymm1, %ymm1
6341 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
6342 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6343 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
6344 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6345 ; AVX2-FCP-NEXT: vmovaps 128(%rsi), %xmm0
6346 ; AVX2-FCP-NEXT: vinsertf128 $1, 128(%rcx), %ymm0, %ymm0
6347 ; AVX2-FCP-NEXT: vmovaps 128(%rdi), %xmm1
6348 ; AVX2-FCP-NEXT: vinsertf128 $1, 128(%rdx), %ymm1, %ymm1
6349 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
6350 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6351 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
6352 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6353 ; AVX2-FCP-NEXT: vmovaps 160(%rsi), %xmm0
6354 ; AVX2-FCP-NEXT: vinsertf128 $1, 160(%rcx), %ymm0, %ymm0
6355 ; AVX2-FCP-NEXT: vmovaps 160(%rdi), %xmm1
6356 ; AVX2-FCP-NEXT: vinsertf128 $1, 160(%rdx), %ymm1, %ymm1
6357 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
6358 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6359 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
6360 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6361 ; AVX2-FCP-NEXT: vmovaps 192(%rsi), %xmm0
6362 ; AVX2-FCP-NEXT: vinsertf128 $1, 192(%rcx), %ymm0, %ymm0
6363 ; AVX2-FCP-NEXT: vmovaps 192(%rdi), %xmm1
6364 ; AVX2-FCP-NEXT: vinsertf128 $1, 192(%rdx), %ymm1, %ymm1
6365 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
6366 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6367 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
6368 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6369 ; AVX2-FCP-NEXT: vmovaps 224(%rsi), %xmm0
6370 ; AVX2-FCP-NEXT: vinsertf128 $1, 224(%rcx), %ymm0, %ymm0
6371 ; AVX2-FCP-NEXT: vmovaps 224(%rdi), %xmm1
6372 ; AVX2-FCP-NEXT: vinsertf128 $1, 224(%rdx), %ymm1, %ymm1
6373 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
6374 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6375 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
6376 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6377 ; AVX2-FCP-NEXT: vmovaps 256(%rsi), %xmm0
6378 ; AVX2-FCP-NEXT: vinsertf128 $1, 256(%rcx), %ymm0, %ymm0
6379 ; AVX2-FCP-NEXT: vmovaps 256(%rdi), %xmm1
6380 ; AVX2-FCP-NEXT: vinsertf128 $1, 256(%rdx), %ymm1, %ymm1
6381 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
6382 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6383 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
6384 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6385 ; AVX2-FCP-NEXT: vmovaps 288(%rsi), %xmm0
6386 ; AVX2-FCP-NEXT: vinsertf128 $1, 288(%rcx), %ymm0, %ymm0
6387 ; AVX2-FCP-NEXT: vmovaps 288(%rdi), %xmm1
6388 ; AVX2-FCP-NEXT: vinsertf128 $1, 288(%rdx), %ymm1, %ymm1
6389 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
6390 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6391 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
6392 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6393 ; AVX2-FCP-NEXT: vmovaps 320(%rsi), %xmm0
6394 ; AVX2-FCP-NEXT: vinsertf128 $1, 320(%rcx), %ymm0, %ymm0
6395 ; AVX2-FCP-NEXT: vmovaps 320(%rdi), %xmm1
6396 ; AVX2-FCP-NEXT: vinsertf128 $1, 320(%rdx), %ymm1, %ymm1
6397 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
6398 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6399 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
6400 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6401 ; AVX2-FCP-NEXT: vmovaps 352(%rsi), %xmm0
6402 ; AVX2-FCP-NEXT: vinsertf128 $1, 352(%rcx), %ymm0, %ymm0
6403 ; AVX2-FCP-NEXT: vmovaps 352(%rdi), %xmm1
6404 ; AVX2-FCP-NEXT: vinsertf128 $1, 352(%rdx), %ymm1, %ymm1
6405 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
6406 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6407 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
6408 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6409 ; AVX2-FCP-NEXT: vmovaps 384(%rsi), %xmm0
6410 ; AVX2-FCP-NEXT: vinsertf128 $1, 384(%rcx), %ymm0, %ymm0
6411 ; AVX2-FCP-NEXT: vmovaps 384(%rdi), %xmm1
6412 ; AVX2-FCP-NEXT: vinsertf128 $1, 384(%rdx), %ymm1, %ymm1
6413 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
6414 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6415 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
6416 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6417 ; AVX2-FCP-NEXT: vmovaps 416(%rsi), %xmm0
6418 ; AVX2-FCP-NEXT: vinsertf128 $1, 416(%rcx), %ymm0, %ymm0
6419 ; AVX2-FCP-NEXT: vmovaps 416(%rdi), %xmm1
6420 ; AVX2-FCP-NEXT: vinsertf128 $1, 416(%rdx), %ymm1, %ymm1
6421 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
6422 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6423 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
6424 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6425 ; AVX2-FCP-NEXT: vmovaps 448(%rsi), %xmm0
6426 ; AVX2-FCP-NEXT: vinsertf128 $1, 448(%rcx), %ymm0, %ymm0
6427 ; AVX2-FCP-NEXT: vmovaps 448(%rdi), %xmm1
6428 ; AVX2-FCP-NEXT: vinsertf128 $1, 448(%rdx), %ymm1, %ymm1
6429 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
6430 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6431 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
6432 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6433 ; AVX2-FCP-NEXT: vmovaps 480(%rsi), %xmm0
6434 ; AVX2-FCP-NEXT: vinsertf128 $1, 480(%rcx), %ymm0, %ymm0
6435 ; AVX2-FCP-NEXT: vmovaps 480(%rdi), %xmm1
6436 ; AVX2-FCP-NEXT: vinsertf128 $1, 480(%rdx), %ymm1, %ymm1
6437 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
6438 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6439 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
6440 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6441 ; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm0
6442 ; AVX2-FCP-NEXT: vmovaps (%rsi), %ymm1
6443 ; AVX2-FCP-NEXT: vmovaps (%rdx), %ymm2
6444 ; AVX2-FCP-NEXT: vmovaps (%rcx), %ymm3
6445 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
6446 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
6447 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm5[2,3],ymm4[2,3]
6448 ; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6449 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
6450 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
6451 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
6452 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6453 ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm0
6454 ; AVX2-FCP-NEXT: vmovaps 32(%rsi), %ymm1
6455 ; AVX2-FCP-NEXT: vmovaps 32(%rdx), %ymm2
6456 ; AVX2-FCP-NEXT: vmovaps 32(%rcx), %ymm3
6457 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
6458 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
6459 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm5[2,3],ymm4[2,3]
6460 ; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6461 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
6462 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
6463 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
6464 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6465 ; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm0
6466 ; AVX2-FCP-NEXT: vmovaps 64(%rsi), %ymm1
6467 ; AVX2-FCP-NEXT: vmovaps 64(%rdx), %ymm2
6468 ; AVX2-FCP-NEXT: vmovaps 64(%rcx), %ymm3
6469 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
6470 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
6471 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm5[2,3],ymm4[2,3]
6472 ; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6473 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
6474 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
6475 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
6476 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6477 ; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm0
6478 ; AVX2-FCP-NEXT: vmovaps 96(%rsi), %ymm1
6479 ; AVX2-FCP-NEXT: vmovaps 96(%rdx), %ymm2
6480 ; AVX2-FCP-NEXT: vmovaps 96(%rcx), %ymm3
6481 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
6482 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
6483 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm5[2,3],ymm4[2,3]
6484 ; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6485 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
6486 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
6487 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
6488 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6489 ; AVX2-FCP-NEXT: vmovaps 128(%rdi), %ymm0
6490 ; AVX2-FCP-NEXT: vmovaps 128(%rsi), %ymm1
6491 ; AVX2-FCP-NEXT: vmovaps 128(%rdx), %ymm2
6492 ; AVX2-FCP-NEXT: vmovaps 128(%rcx), %ymm3
6493 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
6494 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
6495 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm5[2,3],ymm4[2,3]
6496 ; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6497 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
6498 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
6499 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
6500 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6501 ; AVX2-FCP-NEXT: vmovaps 160(%rdi), %ymm0
6502 ; AVX2-FCP-NEXT: vmovaps 160(%rsi), %ymm1
6503 ; AVX2-FCP-NEXT: vmovaps 160(%rdx), %ymm2
6504 ; AVX2-FCP-NEXT: vmovaps 160(%rcx), %ymm3
6505 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
6506 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
6507 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm5[2,3],ymm4[2,3]
6508 ; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6509 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
6510 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
6511 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
6512 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6513 ; AVX2-FCP-NEXT: vmovaps 192(%rdi), %ymm0
6514 ; AVX2-FCP-NEXT: vmovaps 192(%rsi), %ymm1
6515 ; AVX2-FCP-NEXT: vmovaps 192(%rdx), %ymm2
6516 ; AVX2-FCP-NEXT: vmovaps 192(%rcx), %ymm3
6517 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
6518 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
6519 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm5[2,3],ymm4[2,3]
6520 ; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6521 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
6522 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
6523 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
6524 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6525 ; AVX2-FCP-NEXT: vmovaps 224(%rdi), %ymm0
6526 ; AVX2-FCP-NEXT: vmovaps 224(%rsi), %ymm1
6527 ; AVX2-FCP-NEXT: vmovaps 224(%rdx), %ymm2
6528 ; AVX2-FCP-NEXT: vmovaps 224(%rcx), %ymm3
6529 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
6530 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
6531 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm5[2,3],ymm4[2,3]
6532 ; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6533 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
6534 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
6535 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
6536 ; AVX2-FCP-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
6537 ; AVX2-FCP-NEXT: vmovaps 256(%rdi), %ymm0
6538 ; AVX2-FCP-NEXT: vmovaps 256(%rsi), %ymm1
6539 ; AVX2-FCP-NEXT: vmovaps 256(%rdx), %ymm2
6540 ; AVX2-FCP-NEXT: vmovaps 256(%rcx), %ymm3
6541 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
6542 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
6543 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm5[2,3],ymm4[2,3]
6544 ; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6545 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
6546 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
6547 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
6548 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6549 ; AVX2-FCP-NEXT: vmovaps 288(%rdi), %ymm0
6550 ; AVX2-FCP-NEXT: vmovaps 288(%rsi), %ymm1
6551 ; AVX2-FCP-NEXT: vmovaps 288(%rdx), %ymm2
6552 ; AVX2-FCP-NEXT: vmovaps 288(%rcx), %ymm3
6553 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
6554 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
6555 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm5[2,3],ymm4[2,3]
6556 ; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6557 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
6558 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
6559 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
6560 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6561 ; AVX2-FCP-NEXT: vmovaps 320(%rdi), %ymm0
6562 ; AVX2-FCP-NEXT: vmovaps 320(%rsi), %ymm1
6563 ; AVX2-FCP-NEXT: vmovaps 320(%rdx), %ymm2
6564 ; AVX2-FCP-NEXT: vmovaps 320(%rcx), %ymm3
6565 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
6566 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
6567 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm5[2,3],ymm4[2,3]
6568 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
6569 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
6570 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm0[2,3],ymm2[2,3]
6571 ; AVX2-FCP-NEXT: vmovaps 352(%rdi), %ymm3
6572 ; AVX2-FCP-NEXT: vmovaps 352(%rsi), %ymm1
6573 ; AVX2-FCP-NEXT: vmovaps 352(%rdx), %ymm2
6574 ; AVX2-FCP-NEXT: vmovaps 352(%rcx), %ymm0
6575 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm0[0],ymm2[2],ymm0[2]
6576 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
6577 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm15[2,3],ymm4[2,3]
6578 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3]
6579 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
6580 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm1[2,3],ymm0[2,3]
6581 ; AVX2-FCP-NEXT: vmovaps 384(%rdi), %ymm1
6582 ; AVX2-FCP-NEXT: vmovaps 384(%rsi), %ymm3
6583 ; AVX2-FCP-NEXT: vmovaps 384(%rdx), %ymm15
6584 ; AVX2-FCP-NEXT: vmovaps 384(%rcx), %ymm0
6585 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm15[0],ymm0[0],ymm15[2],ymm0[2]
6586 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm1[0],ymm3[0],ymm1[2],ymm3[2]
6587 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm14[2,3],ymm2[2,3]
6588 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm15[1],ymm0[1],ymm15[3],ymm0[3]
6589 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3]
6590 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm1[2,3],ymm0[2,3]
6591 ; AVX2-FCP-NEXT: vmovaps 416(%rdi), %ymm1
6592 ; AVX2-FCP-NEXT: vmovaps 416(%rsi), %ymm3
6593 ; AVX2-FCP-NEXT: vmovaps 416(%rdx), %ymm14
6594 ; AVX2-FCP-NEXT: vmovaps 416(%rcx), %ymm15
6595 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm14[0],ymm15[0],ymm14[2],ymm15[2]
6596 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm1[0],ymm3[0],ymm1[2],ymm3[2]
6597 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm13[2,3],ymm0[2,3]
6598 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm14[1],ymm15[1],ymm14[3],ymm15[3]
6599 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3]
6600 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],ymm13[2,3]
6601 ; AVX2-FCP-NEXT: vmovaps 448(%rdi), %ymm3
6602 ; AVX2-FCP-NEXT: vmovaps 448(%rsi), %ymm13
6603 ; AVX2-FCP-NEXT: vmovaps 448(%rdx), %ymm14
6604 ; AVX2-FCP-NEXT: vmovaps 448(%rcx), %ymm15
6605 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm14[0],ymm15[0],ymm14[2],ymm15[2]
6606 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm3[0],ymm13[0],ymm3[2],ymm13[2]
6607 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm12[2,3],ymm0[2,3]
6608 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm14[1],ymm15[1],ymm14[3],ymm15[3]
6609 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm13[1],ymm3[3],ymm13[3]
6610 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm12[2,3]
6611 ; AVX2-FCP-NEXT: vmovaps 480(%rdi), %ymm12
6612 ; AVX2-FCP-NEXT: vmovaps 480(%rsi), %ymm13
6613 ; AVX2-FCP-NEXT: vmovaps 480(%rdx), %ymm14
6614 ; AVX2-FCP-NEXT: vmovaps 480(%rcx), %ymm15
6615 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm14[0],ymm15[0],ymm14[2],ymm15[2]
6616 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm12[0],ymm13[0],ymm12[2],ymm13[2]
6617 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3]
6618 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm14[1],ymm15[1],ymm14[3],ymm15[3]
6619 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm12[1],ymm13[1],ymm12[3],ymm13[3]
6620 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3]
6621 ; AVX2-FCP-NEXT: vmovaps %ymm11, 2016(%r8)
6622 ; AVX2-FCP-NEXT: vmovaps %ymm0, 1984(%r8)
6623 ; AVX2-FCP-NEXT: vmovaps %ymm3, 1888(%r8)
6624 ; AVX2-FCP-NEXT: vmovaps %ymm1, 1856(%r8)
6625 ; AVX2-FCP-NEXT: vmovaps %ymm2, 1760(%r8)
6626 ; AVX2-FCP-NEXT: vmovaps %ymm4, 1728(%r8)
6627 ; AVX2-FCP-NEXT: vmovaps %ymm5, 1632(%r8)
6628 ; AVX2-FCP-NEXT: vmovaps %ymm6, 1600(%r8)
6629 ; AVX2-FCP-NEXT: vmovaps %ymm7, 1504(%r8)
6630 ; AVX2-FCP-NEXT: vmovaps %ymm8, 1472(%r8)
6631 ; AVX2-FCP-NEXT: vmovaps %ymm9, 1376(%r8)
6632 ; AVX2-FCP-NEXT: vmovaps %ymm10, 1344(%r8)
6633 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6634 ; AVX2-FCP-NEXT: vmovaps %ymm0, 1248(%r8)
6635 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6636 ; AVX2-FCP-NEXT: vmovaps %ymm0, 1216(%r8)
6637 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6638 ; AVX2-FCP-NEXT: vmovaps %ymm0, 1120(%r8)
6639 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6640 ; AVX2-FCP-NEXT: vmovaps %ymm0, 1088(%r8)
6641 ; AVX2-FCP-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
6642 ; AVX2-FCP-NEXT: vmovaps %ymm0, 992(%r8)
6643 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6644 ; AVX2-FCP-NEXT: vmovaps %ymm0, 960(%r8)
6645 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6646 ; AVX2-FCP-NEXT: vmovaps %ymm0, 864(%r8)
6647 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6648 ; AVX2-FCP-NEXT: vmovaps %ymm0, 832(%r8)
6649 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6650 ; AVX2-FCP-NEXT: vmovaps %ymm0, 736(%r8)
6651 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6652 ; AVX2-FCP-NEXT: vmovaps %ymm0, 704(%r8)
6653 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6654 ; AVX2-FCP-NEXT: vmovaps %ymm0, 608(%r8)
6655 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6656 ; AVX2-FCP-NEXT: vmovaps %ymm0, 576(%r8)
6657 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6658 ; AVX2-FCP-NEXT: vmovaps %ymm0, 480(%r8)
6659 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6660 ; AVX2-FCP-NEXT: vmovaps %ymm0, 448(%r8)
6661 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6662 ; AVX2-FCP-NEXT: vmovaps %ymm0, 352(%r8)
6663 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6664 ; AVX2-FCP-NEXT: vmovaps %ymm0, 320(%r8)
6665 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6666 ; AVX2-FCP-NEXT: vmovaps %ymm0, 224(%r8)
6667 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6668 ; AVX2-FCP-NEXT: vmovaps %ymm0, 192(%r8)
6669 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6670 ; AVX2-FCP-NEXT: vmovaps %ymm0, 96(%r8)
6671 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6672 ; AVX2-FCP-NEXT: vmovaps %ymm0, 64(%r8)
6673 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6674 ; AVX2-FCP-NEXT: vmovaps %ymm0, 1952(%r8)
6675 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6676 ; AVX2-FCP-NEXT: vmovaps %ymm0, 1920(%r8)
6677 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6678 ; AVX2-FCP-NEXT: vmovaps %ymm0, 1824(%r8)
6679 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6680 ; AVX2-FCP-NEXT: vmovaps %ymm0, 1792(%r8)
6681 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6682 ; AVX2-FCP-NEXT: vmovaps %ymm0, 1696(%r8)
6683 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6684 ; AVX2-FCP-NEXT: vmovaps %ymm0, 1664(%r8)
6685 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6686 ; AVX2-FCP-NEXT: vmovaps %ymm0, 1568(%r8)
6687 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6688 ; AVX2-FCP-NEXT: vmovaps %ymm0, 1536(%r8)
6689 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6690 ; AVX2-FCP-NEXT: vmovaps %ymm0, 1440(%r8)
6691 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6692 ; AVX2-FCP-NEXT: vmovaps %ymm0, 1408(%r8)
6693 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6694 ; AVX2-FCP-NEXT: vmovaps %ymm0, 1312(%r8)
6695 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6696 ; AVX2-FCP-NEXT: vmovaps %ymm0, 1280(%r8)
6697 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6698 ; AVX2-FCP-NEXT: vmovaps %ymm0, 1184(%r8)
6699 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6700 ; AVX2-FCP-NEXT: vmovaps %ymm0, 1152(%r8)
6701 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6702 ; AVX2-FCP-NEXT: vmovaps %ymm0, 1056(%r8)
6703 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6704 ; AVX2-FCP-NEXT: vmovaps %ymm0, 1024(%r8)
6705 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6706 ; AVX2-FCP-NEXT: vmovaps %ymm0, 928(%r8)
6707 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6708 ; AVX2-FCP-NEXT: vmovaps %ymm0, 896(%r8)
6709 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6710 ; AVX2-FCP-NEXT: vmovaps %ymm0, 800(%r8)
6711 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6712 ; AVX2-FCP-NEXT: vmovaps %ymm0, 768(%r8)
6713 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6714 ; AVX2-FCP-NEXT: vmovaps %ymm0, 672(%r8)
6715 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6716 ; AVX2-FCP-NEXT: vmovaps %ymm0, 640(%r8)
6717 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6718 ; AVX2-FCP-NEXT: vmovaps %ymm0, 544(%r8)
6719 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6720 ; AVX2-FCP-NEXT: vmovaps %ymm0, 512(%r8)
6721 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6722 ; AVX2-FCP-NEXT: vmovaps %ymm0, 416(%r8)
6723 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6724 ; AVX2-FCP-NEXT: vmovaps %ymm0, 384(%r8)
6725 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6726 ; AVX2-FCP-NEXT: vmovaps %ymm0, 288(%r8)
6727 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6728 ; AVX2-FCP-NEXT: vmovaps %ymm0, 256(%r8)
6729 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6730 ; AVX2-FCP-NEXT: vmovaps %ymm0, 160(%r8)
6731 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6732 ; AVX2-FCP-NEXT: vmovaps %ymm0, 128(%r8)
6733 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6734 ; AVX2-FCP-NEXT: vmovaps %ymm0, 32(%r8)
6735 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6736 ; AVX2-FCP-NEXT: vmovaps %ymm0, (%r8)
6737 ; AVX2-FCP-NEXT: addq $1544, %rsp # imm = 0x608
6738 ; AVX2-FCP-NEXT: vzeroupper
6739 ; AVX2-FCP-NEXT: retq
6741 ; AVX512-LABEL: store_i64_stride4_vf64:
6743 ; AVX512-NEXT: subq $2120, %rsp # imm = 0x848
6744 ; AVX512-NEXT: vmovdqa64 384(%rdx), %zmm6
6745 ; AVX512-NEXT: vmovdqa64 320(%rdx), %zmm7
6746 ; AVX512-NEXT: vmovdqa64 256(%rdx), %zmm8
6747 ; AVX512-NEXT: vmovdqa64 (%rdx), %zmm14
6748 ; AVX512-NEXT: vmovdqa64 64(%rdx), %zmm12
6749 ; AVX512-NEXT: vmovdqa64 128(%rdx), %zmm11
6750 ; AVX512-NEXT: vmovdqa64 192(%rdx), %zmm10
6751 ; AVX512-NEXT: vmovdqa64 384(%rcx), %zmm23
6752 ; AVX512-NEXT: vmovdqa64 320(%rcx), %zmm3
6753 ; AVX512-NEXT: vmovdqa64 256(%rcx), %zmm0
6754 ; AVX512-NEXT: vmovdqa64 (%rcx), %zmm4
6755 ; AVX512-NEXT: vmovdqa64 64(%rcx), %zmm5
6756 ; AVX512-NEXT: vmovdqa64 128(%rcx), %zmm2
6757 ; AVX512-NEXT: vmovdqa64 192(%rcx), %zmm1
6758 ; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,8,0,0,1,9]
6759 ; AVX512-NEXT: vmovdqa64 %zmm14, %zmm13
6760 ; AVX512-NEXT: vpermt2q %zmm4, %zmm9, %zmm13
6761 ; AVX512-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6762 ; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,2,10,0,0,3,11]
6763 ; AVX512-NEXT: vmovdqa64 %zmm14, %zmm15
6764 ; AVX512-NEXT: vpermt2q %zmm4, %zmm13, %zmm15
6765 ; AVX512-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6766 ; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,4,12,0,0,5,13]
6767 ; AVX512-NEXT: vmovdqa64 %zmm14, %zmm16
6768 ; AVX512-NEXT: vpermt2q %zmm4, %zmm15, %zmm16
6769 ; AVX512-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6770 ; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,6,14,0,0,7,15]
6771 ; AVX512-NEXT: vpermt2q %zmm4, %zmm26, %zmm14
6772 ; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6773 ; AVX512-NEXT: vmovdqa64 %zmm12, %zmm4
6774 ; AVX512-NEXT: vpermt2q %zmm5, %zmm9, %zmm12
6775 ; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6776 ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm12
6777 ; AVX512-NEXT: vpermt2q %zmm5, %zmm13, %zmm12
6778 ; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6779 ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm12
6780 ; AVX512-NEXT: vpermt2q %zmm5, %zmm15, %zmm12
6781 ; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6782 ; AVX512-NEXT: vpermt2q %zmm5, %zmm26, %zmm4
6783 ; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6784 ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm4
6785 ; AVX512-NEXT: vpermt2q %zmm2, %zmm9, %zmm4
6786 ; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6787 ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm4
6788 ; AVX512-NEXT: vpermt2q %zmm2, %zmm13, %zmm4
6789 ; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6790 ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm4
6791 ; AVX512-NEXT: vpermt2q %zmm2, %zmm15, %zmm4
6792 ; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6793 ; AVX512-NEXT: vpermt2q %zmm2, %zmm26, %zmm11
6794 ; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6795 ; AVX512-NEXT: vmovdqa64 %zmm10, %zmm2
6796 ; AVX512-NEXT: vpermt2q %zmm1, %zmm9, %zmm2
6797 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6798 ; AVX512-NEXT: vmovdqa64 %zmm10, %zmm2
6799 ; AVX512-NEXT: vpermt2q %zmm1, %zmm13, %zmm2
6800 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6801 ; AVX512-NEXT: vmovdqa64 %zmm10, %zmm2
6802 ; AVX512-NEXT: vpermt2q %zmm1, %zmm15, %zmm2
6803 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6804 ; AVX512-NEXT: vpermt2q %zmm1, %zmm26, %zmm10
6805 ; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6806 ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm1
6807 ; AVX512-NEXT: vpermt2q %zmm0, %zmm9, %zmm1
6808 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6809 ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm1
6810 ; AVX512-NEXT: vpermt2q %zmm0, %zmm13, %zmm1
6811 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6812 ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm1
6813 ; AVX512-NEXT: vpermt2q %zmm0, %zmm15, %zmm1
6814 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6815 ; AVX512-NEXT: vpermt2q %zmm0, %zmm26, %zmm8
6816 ; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6817 ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm0
6818 ; AVX512-NEXT: vpermt2q %zmm3, %zmm9, %zmm0
6819 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6820 ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm0
6821 ; AVX512-NEXT: vpermt2q %zmm3, %zmm13, %zmm0
6822 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6823 ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm0
6824 ; AVX512-NEXT: vpermt2q %zmm3, %zmm15, %zmm0
6825 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6826 ; AVX512-NEXT: vpermt2q %zmm3, %zmm26, %zmm7
6827 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6828 ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm0
6829 ; AVX512-NEXT: vpermt2q %zmm23, %zmm9, %zmm0
6830 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6831 ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm0
6832 ; AVX512-NEXT: vpermt2q %zmm23, %zmm13, %zmm0
6833 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6834 ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm0
6835 ; AVX512-NEXT: vpermt2q %zmm23, %zmm15, %zmm0
6836 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6837 ; AVX512-NEXT: vpermt2q %zmm23, %zmm26, %zmm6
6838 ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6839 ; AVX512-NEXT: vmovdqa64 448(%rdx), %zmm28
6840 ; AVX512-NEXT: vmovdqa64 448(%rcx), %zmm0
6841 ; AVX512-NEXT: vpermi2q %zmm0, %zmm28, %zmm9
6842 ; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6843 ; AVX512-NEXT: vpermi2q %zmm0, %zmm28, %zmm13
6844 ; AVX512-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6845 ; AVX512-NEXT: vpermi2q %zmm0, %zmm28, %zmm15
6846 ; AVX512-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6847 ; AVX512-NEXT: vpermt2q %zmm0, %zmm26, %zmm28
6848 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm26
6849 ; AVX512-NEXT: vmovdqa64 (%rsi), %zmm3
6850 ; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,8,0,0,1,9,0,0]
6851 ; AVX512-NEXT: vmovdqa64 %zmm26, %zmm0
6852 ; AVX512-NEXT: vpermt2q %zmm3, %zmm7, %zmm0
6853 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm30
6854 ; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm5 = [2,10,0,0,3,11,0,0]
6855 ; AVX512-NEXT: vmovdqa64 %zmm26, %zmm0
6856 ; AVX512-NEXT: vpermt2q %zmm3, %zmm5, %zmm0
6857 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6858 ; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm2 = [4,12,0,0,5,13,0,0]
6859 ; AVX512-NEXT: vmovdqa64 %zmm26, %zmm0
6860 ; AVX512-NEXT: vpermt2q %zmm3, %zmm2, %zmm0
6861 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6862 ; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm22 = [6,14,0,0,7,15,0,0]
6863 ; AVX512-NEXT: vpermt2q %zmm3, %zmm22, %zmm26
6864 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm20
6865 ; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm3
6866 ; AVX512-NEXT: vmovdqa64 %zmm20, %zmm0
6867 ; AVX512-NEXT: vpermt2q %zmm3, %zmm7, %zmm0
6868 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6869 ; AVX512-NEXT: vmovdqa64 %zmm20, %zmm0
6870 ; AVX512-NEXT: vpermt2q %zmm3, %zmm5, %zmm0
6871 ; AVX512-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
6872 ; AVX512-NEXT: vmovdqa64 %zmm20, %zmm29
6873 ; AVX512-NEXT: vpermt2q %zmm3, %zmm2, %zmm29
6874 ; AVX512-NEXT: vpermt2q %zmm3, %zmm22, %zmm20
6875 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm18
6876 ; AVX512-NEXT: vmovdqa64 128(%rsi), %zmm3
6877 ; AVX512-NEXT: vmovdqa64 %zmm18, %zmm27
6878 ; AVX512-NEXT: vpermt2q %zmm3, %zmm7, %zmm27
6879 ; AVX512-NEXT: vmovdqa64 %zmm18, %zmm25
6880 ; AVX512-NEXT: vpermt2q %zmm3, %zmm5, %zmm25
6881 ; AVX512-NEXT: vmovdqa64 %zmm18, %zmm24
6882 ; AVX512-NEXT: vpermt2q %zmm3, %zmm2, %zmm24
6883 ; AVX512-NEXT: vpermt2q %zmm3, %zmm22, %zmm18
6884 ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm19
6885 ; AVX512-NEXT: vmovdqa64 192(%rsi), %zmm4
6886 ; AVX512-NEXT: vmovdqa64 %zmm19, %zmm23
6887 ; AVX512-NEXT: vpermt2q %zmm4, %zmm7, %zmm23
6888 ; AVX512-NEXT: vmovdqa64 %zmm19, %zmm21
6889 ; AVX512-NEXT: vpermt2q %zmm4, %zmm5, %zmm21
6890 ; AVX512-NEXT: vmovdqa64 %zmm19, %zmm17
6891 ; AVX512-NEXT: vpermt2q %zmm4, %zmm2, %zmm17
6892 ; AVX512-NEXT: vpermt2q %zmm4, %zmm22, %zmm19
6893 ; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm4
6894 ; AVX512-NEXT: vmovdqa64 256(%rsi), %zmm6
6895 ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm16
6896 ; AVX512-NEXT: vpermt2q %zmm6, %zmm7, %zmm16
6897 ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm15
6898 ; AVX512-NEXT: vpermt2q %zmm6, %zmm5, %zmm15
6899 ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm14
6900 ; AVX512-NEXT: vpermt2q %zmm6, %zmm2, %zmm14
6901 ; AVX512-NEXT: vpermt2q %zmm6, %zmm22, %zmm4
6902 ; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm6
6903 ; AVX512-NEXT: vmovdqa64 320(%rsi), %zmm0
6904 ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm13
6905 ; AVX512-NEXT: vpermt2q %zmm0, %zmm7, %zmm13
6906 ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm12
6907 ; AVX512-NEXT: vpermt2q %zmm0, %zmm5, %zmm12
6908 ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm11
6909 ; AVX512-NEXT: vpermt2q %zmm0, %zmm2, %zmm11
6910 ; AVX512-NEXT: vpermt2q %zmm0, %zmm22, %zmm6
6911 ; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm9
6912 ; AVX512-NEXT: vmovdqa64 384(%rsi), %zmm0
6913 ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm10
6914 ; AVX512-NEXT: vpermt2q %zmm0, %zmm7, %zmm10
6915 ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm8
6916 ; AVX512-NEXT: vpermt2q %zmm0, %zmm5, %zmm8
6917 ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm3
6918 ; AVX512-NEXT: vpermt2q %zmm0, %zmm2, %zmm3
6919 ; AVX512-NEXT: vpermt2q %zmm0, %zmm22, %zmm9
6920 ; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm1
6921 ; AVX512-NEXT: vmovdqa64 448(%rsi), %zmm0
6922 ; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm7
6923 ; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm5
6924 ; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
6925 ; AVX512-NEXT: vpermt2q %zmm0, %zmm22, %zmm1
6926 ; AVX512-NEXT: movb $-52, %al
6927 ; AVX512-NEXT: kmovw %eax, %k1
6928 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
6929 ; AVX512-NEXT: vmovdqa64 %zmm30, %zmm22
6930 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1}
6931 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
6932 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
6933 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1}
6934 ; AVX512-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6935 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
6936 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
6937 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1}
6938 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
6939 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1}
6940 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
6941 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
6942 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1}
6943 ; AVX512-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6944 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
6945 ; AVX512-NEXT: vmovdqu64 (%rsp), %zmm30 # 64-byte Reload
6946 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1}
6947 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
6948 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1}
6949 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
6950 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1}
6951 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
6952 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1}
6953 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
6954 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1}
6955 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
6956 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1}
6957 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
6958 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1}
6959 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
6960 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1}
6961 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
6962 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1}
6963 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
6964 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1}
6965 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
6966 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1}
6967 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
6968 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1}
6969 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
6970 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1}
6971 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
6972 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1}
6973 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
6974 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1}
6975 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
6976 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1}
6977 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
6978 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1}
6979 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
6980 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1}
6981 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
6982 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1}
6983 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
6984 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1}
6985 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
6986 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1}
6987 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
6988 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1}
6989 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
6990 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1}
6991 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
6992 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1}
6993 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
6994 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1}
6995 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
6996 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
6997 ; AVX512-NEXT: vmovdqa64 %zmm28, %zmm1 {%k1}
6998 ; AVX512-NEXT: vmovdqa64 %zmm1, 1984(%r8)
6999 ; AVX512-NEXT: vmovdqa64 %zmm2, 1920(%r8)
7000 ; AVX512-NEXT: vmovdqa64 %zmm5, 1856(%r8)
7001 ; AVX512-NEXT: vmovdqa64 %zmm7, 1792(%r8)
7002 ; AVX512-NEXT: vmovdqa64 %zmm9, 1728(%r8)
7003 ; AVX512-NEXT: vmovdqa64 %zmm3, 1664(%r8)
7004 ; AVX512-NEXT: vmovdqa64 %zmm8, 1600(%r8)
7005 ; AVX512-NEXT: vmovdqa64 %zmm10, 1536(%r8)
7006 ; AVX512-NEXT: vmovdqa64 %zmm6, 1472(%r8)
7007 ; AVX512-NEXT: vmovdqa64 %zmm11, 1408(%r8)
7008 ; AVX512-NEXT: vmovdqa64 %zmm12, 1344(%r8)
7009 ; AVX512-NEXT: vmovdqa64 %zmm13, 1280(%r8)
7010 ; AVX512-NEXT: vmovdqa64 %zmm4, 1216(%r8)
7011 ; AVX512-NEXT: vmovdqa64 %zmm14, 1152(%r8)
7012 ; AVX512-NEXT: vmovdqa64 %zmm15, 1088(%r8)
7013 ; AVX512-NEXT: vmovdqa64 %zmm16, 1024(%r8)
7014 ; AVX512-NEXT: vmovdqa64 %zmm19, 960(%r8)
7015 ; AVX512-NEXT: vmovdqa64 %zmm17, 896(%r8)
7016 ; AVX512-NEXT: vmovdqa64 %zmm21, 832(%r8)
7017 ; AVX512-NEXT: vmovdqa64 %zmm23, 768(%r8)
7018 ; AVX512-NEXT: vmovdqa64 %zmm18, 704(%r8)
7019 ; AVX512-NEXT: vmovdqa64 %zmm24, 640(%r8)
7020 ; AVX512-NEXT: vmovdqa64 %zmm25, 576(%r8)
7021 ; AVX512-NEXT: vmovdqa64 %zmm27, 512(%r8)
7022 ; AVX512-NEXT: vmovdqa64 %zmm20, 448(%r8)
7023 ; AVX512-NEXT: vmovdqa64 %zmm29, 384(%r8)
7024 ; AVX512-NEXT: vmovdqa64 %zmm30, 320(%r8)
7025 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7026 ; AVX512-NEXT: vmovaps %zmm0, 256(%r8)
7027 ; AVX512-NEXT: vmovdqa64 %zmm26, 192(%r8)
7028 ; AVX512-NEXT: vmovdqa64 %zmm31, 128(%r8)
7029 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7030 ; AVX512-NEXT: vmovaps %zmm0, 64(%r8)
7031 ; AVX512-NEXT: vmovdqa64 %zmm22, (%r8)
7032 ; AVX512-NEXT: addq $2120, %rsp # imm = 0x848
7033 ; AVX512-NEXT: vzeroupper
7036 ; AVX512-FCP-LABEL: store_i64_stride4_vf64:
7037 ; AVX512-FCP: # %bb.0:
7038 ; AVX512-FCP-NEXT: subq $2120, %rsp # imm = 0x848
7039 ; AVX512-FCP-NEXT: vmovdqa64 384(%rdx), %zmm6
7040 ; AVX512-FCP-NEXT: vmovdqa64 320(%rdx), %zmm7
7041 ; AVX512-FCP-NEXT: vmovdqa64 256(%rdx), %zmm8
7042 ; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm14
7043 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdx), %zmm12
7044 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdx), %zmm11
7045 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdx), %zmm10
7046 ; AVX512-FCP-NEXT: vmovdqa64 384(%rcx), %zmm23
7047 ; AVX512-FCP-NEXT: vmovdqa64 320(%rcx), %zmm3
7048 ; AVX512-FCP-NEXT: vmovdqa64 256(%rcx), %zmm0
7049 ; AVX512-FCP-NEXT: vmovdqa64 (%rcx), %zmm4
7050 ; AVX512-FCP-NEXT: vmovdqa64 64(%rcx), %zmm5
7051 ; AVX512-FCP-NEXT: vmovdqa64 128(%rcx), %zmm2
7052 ; AVX512-FCP-NEXT: vmovdqa64 192(%rcx), %zmm1
7053 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,8,0,0,1,9]
7054 ; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm13
7055 ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm9, %zmm13
7056 ; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7057 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,2,10,0,0,3,11]
7058 ; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm15
7059 ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm13, %zmm15
7060 ; AVX512-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7061 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,4,12,0,0,5,13]
7062 ; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm16
7063 ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm15, %zmm16
7064 ; AVX512-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7065 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,6,14,0,0,7,15]
7066 ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm26, %zmm14
7067 ; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7068 ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm4
7069 ; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm9, %zmm12
7070 ; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7071 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm12
7072 ; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm13, %zmm12
7073 ; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7074 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm12
7075 ; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm15, %zmm12
7076 ; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7077 ; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm26, %zmm4
7078 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7079 ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm4
7080 ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm9, %zmm4
7081 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7082 ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm4
7083 ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm13, %zmm4
7084 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7085 ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm4
7086 ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm15, %zmm4
7087 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7088 ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm26, %zmm11
7089 ; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7090 ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm2
7091 ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm2
7092 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7093 ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm2
7094 ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm13, %zmm2
7095 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7096 ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm2
7097 ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm15, %zmm2
7098 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7099 ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm26, %zmm10
7100 ; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7101 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm1
7102 ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm1
7103 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7104 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm1
7105 ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm13, %zmm1
7106 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7107 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm1
7108 ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm1
7109 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7110 ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm26, %zmm8
7111 ; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7112 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm0
7113 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm9, %zmm0
7114 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7115 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm0
7116 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm13, %zmm0
7117 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7118 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm0
7119 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm15, %zmm0
7120 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7121 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm26, %zmm7
7122 ; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7123 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm0
7124 ; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm9, %zmm0
7125 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7126 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm0
7127 ; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm13, %zmm0
7128 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7129 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm0
7130 ; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm15, %zmm0
7131 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7132 ; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm26, %zmm6
7133 ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7134 ; AVX512-FCP-NEXT: vmovdqa64 448(%rdx), %zmm28
7135 ; AVX512-FCP-NEXT: vmovdqa64 448(%rcx), %zmm0
7136 ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm28, %zmm9
7137 ; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7138 ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm28, %zmm13
7139 ; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7140 ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm28, %zmm15
7141 ; AVX512-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7142 ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm26, %zmm28
7143 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm26
7144 ; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm3
7145 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,8,0,0,1,9,0,0]
7146 ; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm0
7147 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm0
7148 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm30
7149 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [2,10,0,0,3,11,0,0]
7150 ; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm0
7151 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm0
7152 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7153 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [4,12,0,0,5,13,0,0]
7154 ; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm0
7155 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm0
7156 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7157 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [6,14,0,0,7,15,0,0]
7158 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm22, %zmm26
7159 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm20
7160 ; AVX512-FCP-NEXT: vmovdqa64 64(%rsi), %zmm3
7161 ; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm0
7162 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm0
7163 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7164 ; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm0
7165 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm0
7166 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
7167 ; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm29
7168 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm29
7169 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm22, %zmm20
7170 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm18
7171 ; AVX512-FCP-NEXT: vmovdqa64 128(%rsi), %zmm3
7172 ; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm27
7173 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm27
7174 ; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm25
7175 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm25
7176 ; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm24
7177 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm24
7178 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm22, %zmm18
7179 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm19
7180 ; AVX512-FCP-NEXT: vmovdqa64 192(%rsi), %zmm4
7181 ; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm23
7182 ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm7, %zmm23
7183 ; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm21
7184 ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm5, %zmm21
7185 ; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm17
7186 ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm2, %zmm17
7187 ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm22, %zmm19
7188 ; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm4
7189 ; AVX512-FCP-NEXT: vmovdqa64 256(%rsi), %zmm6
7190 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm16
7191 ; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm7, %zmm16
7192 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm15
7193 ; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm5, %zmm15
7194 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm14
7195 ; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm14
7196 ; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm22, %zmm4
7197 ; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm6
7198 ; AVX512-FCP-NEXT: vmovdqa64 320(%rsi), %zmm0
7199 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm13
7200 ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm13
7201 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm12
7202 ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm12
7203 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm11
7204 ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm11
7205 ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm6
7206 ; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm9
7207 ; AVX512-FCP-NEXT: vmovdqa64 384(%rsi), %zmm0
7208 ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm10
7209 ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm10
7210 ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm8
7211 ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm8
7212 ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm3
7213 ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm3
7214 ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm9
7215 ; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm1
7216 ; AVX512-FCP-NEXT: vmovdqa64 448(%rsi), %zmm0
7217 ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm7
7218 ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm5
7219 ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
7220 ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm1
7221 ; AVX512-FCP-NEXT: movb $-52, %al
7222 ; AVX512-FCP-NEXT: kmovw %eax, %k1
7223 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7224 ; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm22
7225 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1}
7226 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7227 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
7228 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1}
7229 ; AVX512-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7230 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7231 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
7232 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1}
7233 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7234 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1}
7235 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7236 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
7237 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1}
7238 ; AVX512-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7239 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7240 ; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm30 # 64-byte Reload
7241 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1}
7242 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7243 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1}
7244 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7245 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1}
7246 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7247 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1}
7248 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7249 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1}
7250 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7251 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1}
7252 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7253 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1}
7254 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7255 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1}
7256 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7257 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1}
7258 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7259 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1}
7260 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7261 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1}
7262 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7263 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1}
7264 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7265 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1}
7266 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7267 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1}
7268 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7269 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1}
7270 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7271 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1}
7272 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7273 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1}
7274 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7275 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1}
7276 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7277 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1}
7278 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7279 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1}
7280 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7281 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1}
7282 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7283 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1}
7284 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7285 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1}
7286 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7287 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1}
7288 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7289 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1}
7290 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7291 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
7292 ; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm1 {%k1}
7293 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 1984(%r8)
7294 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 1920(%r8)
7295 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 1856(%r8)
7296 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 1792(%r8)
7297 ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, 1728(%r8)
7298 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 1664(%r8)
7299 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, 1600(%r8)
7300 ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, 1536(%r8)
7301 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 1472(%r8)
7302 ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, 1408(%r8)
7303 ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 1344(%r8)
7304 ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 1280(%r8)
7305 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 1216(%r8)
7306 ; AVX512-FCP-NEXT: vmovdqa64 %zmm14, 1152(%r8)
7307 ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, 1088(%r8)
7308 ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, 1024(%r8)
7309 ; AVX512-FCP-NEXT: vmovdqa64 %zmm19, 960(%r8)
7310 ; AVX512-FCP-NEXT: vmovdqa64 %zmm17, 896(%r8)
7311 ; AVX512-FCP-NEXT: vmovdqa64 %zmm21, 832(%r8)
7312 ; AVX512-FCP-NEXT: vmovdqa64 %zmm23, 768(%r8)
7313 ; AVX512-FCP-NEXT: vmovdqa64 %zmm18, 704(%r8)
7314 ; AVX512-FCP-NEXT: vmovdqa64 %zmm24, 640(%r8)
7315 ; AVX512-FCP-NEXT: vmovdqa64 %zmm25, 576(%r8)
7316 ; AVX512-FCP-NEXT: vmovdqa64 %zmm27, 512(%r8)
7317 ; AVX512-FCP-NEXT: vmovdqa64 %zmm20, 448(%r8)
7318 ; AVX512-FCP-NEXT: vmovdqa64 %zmm29, 384(%r8)
7319 ; AVX512-FCP-NEXT: vmovdqa64 %zmm30, 320(%r8)
7320 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7321 ; AVX512-FCP-NEXT: vmovaps %zmm0, 256(%r8)
7322 ; AVX512-FCP-NEXT: vmovdqa64 %zmm26, 192(%r8)
7323 ; AVX512-FCP-NEXT: vmovdqa64 %zmm31, 128(%r8)
7324 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7325 ; AVX512-FCP-NEXT: vmovaps %zmm0, 64(%r8)
7326 ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, (%r8)
7327 ; AVX512-FCP-NEXT: addq $2120, %rsp # imm = 0x848
7328 ; AVX512-FCP-NEXT: vzeroupper
7329 ; AVX512-FCP-NEXT: retq
7331 ; AVX512DQ-LABEL: store_i64_stride4_vf64:
7332 ; AVX512DQ: # %bb.0:
7333 ; AVX512DQ-NEXT: subq $2120, %rsp # imm = 0x848
7334 ; AVX512DQ-NEXT: vmovdqa64 384(%rdx), %zmm6
7335 ; AVX512DQ-NEXT: vmovdqa64 320(%rdx), %zmm7
7336 ; AVX512DQ-NEXT: vmovdqa64 256(%rdx), %zmm8
7337 ; AVX512DQ-NEXT: vmovdqa64 (%rdx), %zmm14
7338 ; AVX512DQ-NEXT: vmovdqa64 64(%rdx), %zmm12
7339 ; AVX512DQ-NEXT: vmovdqa64 128(%rdx), %zmm11
7340 ; AVX512DQ-NEXT: vmovdqa64 192(%rdx), %zmm10
7341 ; AVX512DQ-NEXT: vmovdqa64 384(%rcx), %zmm23
7342 ; AVX512DQ-NEXT: vmovdqa64 320(%rcx), %zmm3
7343 ; AVX512DQ-NEXT: vmovdqa64 256(%rcx), %zmm0
7344 ; AVX512DQ-NEXT: vmovdqa64 (%rcx), %zmm4
7345 ; AVX512DQ-NEXT: vmovdqa64 64(%rcx), %zmm5
7346 ; AVX512DQ-NEXT: vmovdqa64 128(%rcx), %zmm2
7347 ; AVX512DQ-NEXT: vmovdqa64 192(%rcx), %zmm1
7348 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,8,0,0,1,9]
7349 ; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm13
7350 ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm9, %zmm13
7351 ; AVX512DQ-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7352 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,2,10,0,0,3,11]
7353 ; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm15
7354 ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm13, %zmm15
7355 ; AVX512DQ-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7356 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,4,12,0,0,5,13]
7357 ; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm16
7358 ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm15, %zmm16
7359 ; AVX512DQ-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7360 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,6,14,0,0,7,15]
7361 ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm26, %zmm14
7362 ; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7363 ; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm4
7364 ; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm9, %zmm12
7365 ; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7366 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm12
7367 ; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm13, %zmm12
7368 ; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7369 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm12
7370 ; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm15, %zmm12
7371 ; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7372 ; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm26, %zmm4
7373 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7374 ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm4
7375 ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm9, %zmm4
7376 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7377 ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm4
7378 ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm13, %zmm4
7379 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7380 ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm4
7381 ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm15, %zmm4
7382 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7383 ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm26, %zmm11
7384 ; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7385 ; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm2
7386 ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm9, %zmm2
7387 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7388 ; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm2
7389 ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm13, %zmm2
7390 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7391 ; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm2
7392 ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm15, %zmm2
7393 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7394 ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm26, %zmm10
7395 ; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7396 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm1
7397 ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm9, %zmm1
7398 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7399 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm1
7400 ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm13, %zmm1
7401 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7402 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm1
7403 ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm15, %zmm1
7404 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7405 ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm26, %zmm8
7406 ; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7407 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm0
7408 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm9, %zmm0
7409 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7410 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm0
7411 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm13, %zmm0
7412 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7413 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm0
7414 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm15, %zmm0
7415 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7416 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm26, %zmm7
7417 ; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7418 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm0
7419 ; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm9, %zmm0
7420 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7421 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm0
7422 ; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm13, %zmm0
7423 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7424 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm0
7425 ; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm15, %zmm0
7426 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7427 ; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm26, %zmm6
7428 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7429 ; AVX512DQ-NEXT: vmovdqa64 448(%rdx), %zmm28
7430 ; AVX512DQ-NEXT: vmovdqa64 448(%rcx), %zmm0
7431 ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm28, %zmm9
7432 ; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7433 ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm28, %zmm13
7434 ; AVX512DQ-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7435 ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm28, %zmm15
7436 ; AVX512DQ-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7437 ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm26, %zmm28
7438 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm26
7439 ; AVX512DQ-NEXT: vmovdqa64 (%rsi), %zmm3
7440 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,8,0,0,1,9,0,0]
7441 ; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm0
7442 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm7, %zmm0
7443 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm30
7444 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm5 = [2,10,0,0,3,11,0,0]
7445 ; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm0
7446 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm5, %zmm0
7447 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7448 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [4,12,0,0,5,13,0,0]
7449 ; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm0
7450 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm2, %zmm0
7451 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7452 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm22 = [6,14,0,0,7,15,0,0]
7453 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm22, %zmm26
7454 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm20
7455 ; AVX512DQ-NEXT: vmovdqa64 64(%rsi), %zmm3
7456 ; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm0
7457 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm7, %zmm0
7458 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7459 ; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm0
7460 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm5, %zmm0
7461 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
7462 ; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm29
7463 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm2, %zmm29
7464 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm22, %zmm20
7465 ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm18
7466 ; AVX512DQ-NEXT: vmovdqa64 128(%rsi), %zmm3
7467 ; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm27
7468 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm7, %zmm27
7469 ; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm25
7470 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm5, %zmm25
7471 ; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm24
7472 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm2, %zmm24
7473 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm22, %zmm18
7474 ; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm19
7475 ; AVX512DQ-NEXT: vmovdqa64 192(%rsi), %zmm4
7476 ; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm23
7477 ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm7, %zmm23
7478 ; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm21
7479 ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm5, %zmm21
7480 ; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm17
7481 ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm2, %zmm17
7482 ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm22, %zmm19
7483 ; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm4
7484 ; AVX512DQ-NEXT: vmovdqa64 256(%rsi), %zmm6
7485 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm16
7486 ; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm7, %zmm16
7487 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm15
7488 ; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm5, %zmm15
7489 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm14
7490 ; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm2, %zmm14
7491 ; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm22, %zmm4
7492 ; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm6
7493 ; AVX512DQ-NEXT: vmovdqa64 320(%rsi), %zmm0
7494 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm13
7495 ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm7, %zmm13
7496 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm12
7497 ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm5, %zmm12
7498 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm11
7499 ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm2, %zmm11
7500 ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm22, %zmm6
7501 ; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm9
7502 ; AVX512DQ-NEXT: vmovdqa64 384(%rsi), %zmm0
7503 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm10
7504 ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm7, %zmm10
7505 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm8
7506 ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm5, %zmm8
7507 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm3
7508 ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm2, %zmm3
7509 ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm22, %zmm9
7510 ; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm1
7511 ; AVX512DQ-NEXT: vmovdqa64 448(%rsi), %zmm0
7512 ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm1, %zmm7
7513 ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm1, %zmm5
7514 ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
7515 ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm22, %zmm1
7516 ; AVX512DQ-NEXT: movb $-52, %al
7517 ; AVX512DQ-NEXT: kmovw %eax, %k1
7518 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7519 ; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm22
7520 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1}
7521 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7522 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
7523 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1}
7524 ; AVX512DQ-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7525 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7526 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
7527 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1}
7528 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7529 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1}
7530 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7531 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
7532 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1}
7533 ; AVX512DQ-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7534 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7535 ; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm30 # 64-byte Reload
7536 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1}
7537 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7538 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1}
7539 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7540 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1}
7541 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7542 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1}
7543 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7544 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1}
7545 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7546 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1}
7547 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7548 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1}
7549 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7550 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1}
7551 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7552 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1}
7553 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7554 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1}
7555 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7556 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1}
7557 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7558 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1}
7559 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7560 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1}
7561 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7562 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1}
7563 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7564 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1}
7565 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7566 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1}
7567 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7568 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1}
7569 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7570 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1}
7571 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7572 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1}
7573 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7574 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1}
7575 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7576 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1}
7577 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7578 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1}
7579 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7580 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1}
7581 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7582 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1}
7583 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7584 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1}
7585 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7586 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
7587 ; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm1 {%k1}
7588 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, 1984(%r8)
7589 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, 1920(%r8)
7590 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, 1856(%r8)
7591 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, 1792(%r8)
7592 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, 1728(%r8)
7593 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, 1664(%r8)
7594 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, 1600(%r8)
7595 ; AVX512DQ-NEXT: vmovdqa64 %zmm10, 1536(%r8)
7596 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, 1472(%r8)
7597 ; AVX512DQ-NEXT: vmovdqa64 %zmm11, 1408(%r8)
7598 ; AVX512DQ-NEXT: vmovdqa64 %zmm12, 1344(%r8)
7599 ; AVX512DQ-NEXT: vmovdqa64 %zmm13, 1280(%r8)
7600 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, 1216(%r8)
7601 ; AVX512DQ-NEXT: vmovdqa64 %zmm14, 1152(%r8)
7602 ; AVX512DQ-NEXT: vmovdqa64 %zmm15, 1088(%r8)
7603 ; AVX512DQ-NEXT: vmovdqa64 %zmm16, 1024(%r8)
7604 ; AVX512DQ-NEXT: vmovdqa64 %zmm19, 960(%r8)
7605 ; AVX512DQ-NEXT: vmovdqa64 %zmm17, 896(%r8)
7606 ; AVX512DQ-NEXT: vmovdqa64 %zmm21, 832(%r8)
7607 ; AVX512DQ-NEXT: vmovdqa64 %zmm23, 768(%r8)
7608 ; AVX512DQ-NEXT: vmovdqa64 %zmm18, 704(%r8)
7609 ; AVX512DQ-NEXT: vmovdqa64 %zmm24, 640(%r8)
7610 ; AVX512DQ-NEXT: vmovdqa64 %zmm25, 576(%r8)
7611 ; AVX512DQ-NEXT: vmovdqa64 %zmm27, 512(%r8)
7612 ; AVX512DQ-NEXT: vmovdqa64 %zmm20, 448(%r8)
7613 ; AVX512DQ-NEXT: vmovdqa64 %zmm29, 384(%r8)
7614 ; AVX512DQ-NEXT: vmovdqa64 %zmm30, 320(%r8)
7615 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7616 ; AVX512DQ-NEXT: vmovaps %zmm0, 256(%r8)
7617 ; AVX512DQ-NEXT: vmovdqa64 %zmm26, 192(%r8)
7618 ; AVX512DQ-NEXT: vmovdqa64 %zmm31, 128(%r8)
7619 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7620 ; AVX512DQ-NEXT: vmovaps %zmm0, 64(%r8)
7621 ; AVX512DQ-NEXT: vmovdqa64 %zmm22, (%r8)
7622 ; AVX512DQ-NEXT: addq $2120, %rsp # imm = 0x848
7623 ; AVX512DQ-NEXT: vzeroupper
7624 ; AVX512DQ-NEXT: retq
7626 ; AVX512DQ-FCP-LABEL: store_i64_stride4_vf64:
7627 ; AVX512DQ-FCP: # %bb.0:
7628 ; AVX512DQ-FCP-NEXT: subq $2120, %rsp # imm = 0x848
7629 ; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdx), %zmm6
7630 ; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdx), %zmm7
7631 ; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdx), %zmm8
7632 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm14
7633 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), %zmm12
7634 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdx), %zmm11
7635 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdx), %zmm10
7636 ; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rcx), %zmm23
7637 ; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rcx), %zmm3
7638 ; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rcx), %zmm0
7639 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rcx), %zmm4
7640 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rcx), %zmm5
7641 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rcx), %zmm2
7642 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rcx), %zmm1
7643 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,8,0,0,1,9]
7644 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm13
7645 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm9, %zmm13
7646 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7647 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,2,10,0,0,3,11]
7648 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm15
7649 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm13, %zmm15
7650 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7651 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,4,12,0,0,5,13]
7652 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm16
7653 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm15, %zmm16
7654 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7655 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,6,14,0,0,7,15]
7656 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm26, %zmm14
7657 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7658 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm4
7659 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm9, %zmm12
7660 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7661 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm12
7662 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm13, %zmm12
7663 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7664 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm12
7665 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm15, %zmm12
7666 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7667 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm26, %zmm4
7668 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7669 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm4
7670 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm9, %zmm4
7671 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7672 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm4
7673 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm13, %zmm4
7674 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7675 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm4
7676 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm15, %zmm4
7677 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7678 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm26, %zmm11
7679 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7680 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm2
7681 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm2
7682 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7683 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm2
7684 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm13, %zmm2
7685 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7686 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm2
7687 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm15, %zmm2
7688 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7689 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm26, %zmm10
7690 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7691 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm1
7692 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm1
7693 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7694 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm1
7695 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm13, %zmm1
7696 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7697 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm1
7698 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm1
7699 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7700 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm26, %zmm8
7701 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7702 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm0
7703 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm9, %zmm0
7704 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7705 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm0
7706 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm13, %zmm0
7707 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7708 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm0
7709 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm15, %zmm0
7710 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7711 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm26, %zmm7
7712 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7713 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm0
7714 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm9, %zmm0
7715 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7716 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm0
7717 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm13, %zmm0
7718 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7719 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm0
7720 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm15, %zmm0
7721 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7722 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm26, %zmm6
7723 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7724 ; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdx), %zmm28
7725 ; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rcx), %zmm0
7726 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm28, %zmm9
7727 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7728 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm28, %zmm13
7729 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7730 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm28, %zmm15
7731 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7732 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm26, %zmm28
7733 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm26
7734 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm3
7735 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,8,0,0,1,9,0,0]
7736 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm0
7737 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm0
7738 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm30
7739 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [2,10,0,0,3,11,0,0]
7740 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm0
7741 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm0
7742 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7743 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [4,12,0,0,5,13,0,0]
7744 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm0
7745 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm0
7746 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7747 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [6,14,0,0,7,15,0,0]
7748 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm22, %zmm26
7749 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm20
7750 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rsi), %zmm3
7751 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm0
7752 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm0
7753 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7754 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm0
7755 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm0
7756 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
7757 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm29
7758 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm29
7759 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm22, %zmm20
7760 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm18
7761 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rsi), %zmm3
7762 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm27
7763 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm27
7764 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm25
7765 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm25
7766 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm24
7767 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm24
7768 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm22, %zmm18
7769 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm19
7770 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rsi), %zmm4
7771 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm23
7772 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm7, %zmm23
7773 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm21
7774 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm5, %zmm21
7775 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm17
7776 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm2, %zmm17
7777 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm22, %zmm19
7778 ; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm4
7779 ; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rsi), %zmm6
7780 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm16
7781 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm7, %zmm16
7782 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm15
7783 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm5, %zmm15
7784 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm14
7785 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm14
7786 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm22, %zmm4
7787 ; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm6
7788 ; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rsi), %zmm0
7789 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm13
7790 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm13
7791 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm12
7792 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm12
7793 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm11
7794 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm11
7795 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm6
7796 ; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm9
7797 ; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rsi), %zmm0
7798 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm10
7799 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm10
7800 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm8
7801 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm8
7802 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm3
7803 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm3
7804 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm9
7805 ; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm1
7806 ; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rsi), %zmm0
7807 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm7
7808 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm5
7809 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
7810 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm1
7811 ; AVX512DQ-FCP-NEXT: movb $-52, %al
7812 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1
7813 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7814 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm22
7815 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1}
7816 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7817 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
7818 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1}
7819 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7820 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7821 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
7822 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1}
7823 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7824 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1}
7825 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7826 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
7827 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1}
7828 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7829 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7830 ; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm30 # 64-byte Reload
7831 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1}
7832 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7833 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1}
7834 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7835 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1}
7836 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7837 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1}
7838 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7839 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1}
7840 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7841 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1}
7842 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7843 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1}
7844 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7845 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1}
7846 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7847 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1}
7848 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7849 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1}
7850 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7851 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1}
7852 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7853 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1}
7854 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7855 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1}
7856 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7857 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1}
7858 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7859 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1}
7860 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7861 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1}
7862 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7863 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1}
7864 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7865 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1}
7866 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7867 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1}
7868 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7869 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1}
7870 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7871 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1}
7872 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7873 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1}
7874 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7875 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1}
7876 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7877 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1}
7878 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7879 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1}
7880 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7881 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
7882 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm1 {%k1}
7883 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 1984(%r8)
7884 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 1920(%r8)
7885 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 1856(%r8)
7886 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 1792(%r8)
7887 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, 1728(%r8)
7888 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 1664(%r8)
7889 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, 1600(%r8)
7890 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, 1536(%r8)
7891 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 1472(%r8)
7892 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, 1408(%r8)
7893 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, 1344(%r8)
7894 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 1280(%r8)
7895 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 1216(%r8)
7896 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, 1152(%r8)
7897 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, 1088(%r8)
7898 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, 1024(%r8)
7899 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, 960(%r8)
7900 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, 896(%r8)
7901 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, 832(%r8)
7902 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, 768(%r8)
7903 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, 704(%r8)
7904 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, 640(%r8)
7905 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, 576(%r8)
7906 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, 512(%r8)
7907 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, 448(%r8)
7908 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, 384(%r8)
7909 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, 320(%r8)
7910 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7911 ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 256(%r8)
7912 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, 192(%r8)
7913 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, 128(%r8)
7914 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7915 ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 64(%r8)
7916 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, (%r8)
7917 ; AVX512DQ-FCP-NEXT: addq $2120, %rsp # imm = 0x848
7918 ; AVX512DQ-FCP-NEXT: vzeroupper
7919 ; AVX512DQ-FCP-NEXT: retq
7921 ; AVX512BW-LABEL: store_i64_stride4_vf64:
7922 ; AVX512BW: # %bb.0:
7923 ; AVX512BW-NEXT: subq $2120, %rsp # imm = 0x848
7924 ; AVX512BW-NEXT: vmovdqa64 384(%rdx), %zmm6
7925 ; AVX512BW-NEXT: vmovdqa64 320(%rdx), %zmm7
7926 ; AVX512BW-NEXT: vmovdqa64 256(%rdx), %zmm8
7927 ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm14
7928 ; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm12
7929 ; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm11
7930 ; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm10
7931 ; AVX512BW-NEXT: vmovdqa64 384(%rcx), %zmm23
7932 ; AVX512BW-NEXT: vmovdqa64 320(%rcx), %zmm3
7933 ; AVX512BW-NEXT: vmovdqa64 256(%rcx), %zmm0
7934 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm4
7935 ; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm5
7936 ; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm2
7937 ; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm1
7938 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,8,0,0,1,9]
7939 ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm13
7940 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm9, %zmm13
7941 ; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7942 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,2,10,0,0,3,11]
7943 ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm15
7944 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm13, %zmm15
7945 ; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7946 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,4,12,0,0,5,13]
7947 ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm16
7948 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm15, %zmm16
7949 ; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7950 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,6,14,0,0,7,15]
7951 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm26, %zmm14
7952 ; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7953 ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm4
7954 ; AVX512BW-NEXT: vpermt2q %zmm5, %zmm9, %zmm12
7955 ; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7956 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm12
7957 ; AVX512BW-NEXT: vpermt2q %zmm5, %zmm13, %zmm12
7958 ; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7959 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm12
7960 ; AVX512BW-NEXT: vpermt2q %zmm5, %zmm15, %zmm12
7961 ; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7962 ; AVX512BW-NEXT: vpermt2q %zmm5, %zmm26, %zmm4
7963 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7964 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm4
7965 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm9, %zmm4
7966 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7967 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm4
7968 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm13, %zmm4
7969 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7970 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm4
7971 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm15, %zmm4
7972 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7973 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm26, %zmm11
7974 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7975 ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm2
7976 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm2
7977 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7978 ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm2
7979 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm13, %zmm2
7980 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7981 ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm2
7982 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm15, %zmm2
7983 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7984 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm26, %zmm10
7985 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7986 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1
7987 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm1
7988 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7989 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1
7990 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm13, %zmm1
7991 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7992 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1
7993 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm1
7994 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7995 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm26, %zmm8
7996 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7997 ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm0
7998 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm9, %zmm0
7999 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8000 ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm0
8001 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm13, %zmm0
8002 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8003 ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm0
8004 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm15, %zmm0
8005 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8006 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm26, %zmm7
8007 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8008 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0
8009 ; AVX512BW-NEXT: vpermt2q %zmm23, %zmm9, %zmm0
8010 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8011 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0
8012 ; AVX512BW-NEXT: vpermt2q %zmm23, %zmm13, %zmm0
8013 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8014 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0
8015 ; AVX512BW-NEXT: vpermt2q %zmm23, %zmm15, %zmm0
8016 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8017 ; AVX512BW-NEXT: vpermt2q %zmm23, %zmm26, %zmm6
8018 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8019 ; AVX512BW-NEXT: vmovdqa64 448(%rdx), %zmm28
8020 ; AVX512BW-NEXT: vmovdqa64 448(%rcx), %zmm0
8021 ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm28, %zmm9
8022 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8023 ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm28, %zmm13
8024 ; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8025 ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm28, %zmm15
8026 ; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8027 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm26, %zmm28
8028 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm26
8029 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm3
8030 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,8,0,0,1,9,0,0]
8031 ; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0
8032 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm0
8033 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm30
8034 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [2,10,0,0,3,11,0,0]
8035 ; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0
8036 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm5, %zmm0
8037 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8038 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [4,12,0,0,5,13,0,0]
8039 ; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0
8040 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm0
8041 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8042 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm22 = [6,14,0,0,7,15,0,0]
8043 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm22, %zmm26
8044 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm20
8045 ; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm3
8046 ; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm0
8047 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm0
8048 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8049 ; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm0
8050 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm5, %zmm0
8051 ; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
8052 ; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm29
8053 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm29
8054 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm22, %zmm20
8055 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm18
8056 ; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm3
8057 ; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm27
8058 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm27
8059 ; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm25
8060 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm5, %zmm25
8061 ; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm24
8062 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm24
8063 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm22, %zmm18
8064 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm19
8065 ; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm4
8066 ; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm23
8067 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm7, %zmm23
8068 ; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm21
8069 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm5, %zmm21
8070 ; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm17
8071 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm17
8072 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm22, %zmm19
8073 ; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm4
8074 ; AVX512BW-NEXT: vmovdqa64 256(%rsi), %zmm6
8075 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm16
8076 ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm7, %zmm16
8077 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm15
8078 ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm5, %zmm15
8079 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm14
8080 ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm2, %zmm14
8081 ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm22, %zmm4
8082 ; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm6
8083 ; AVX512BW-NEXT: vmovdqa64 320(%rsi), %zmm0
8084 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm13
8085 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm13
8086 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm12
8087 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm12
8088 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11
8089 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm11
8090 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm22, %zmm6
8091 ; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm9
8092 ; AVX512BW-NEXT: vmovdqa64 384(%rsi), %zmm0
8093 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm10
8094 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm10
8095 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm8
8096 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm8
8097 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm3
8098 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm3
8099 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm22, %zmm9
8100 ; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm1
8101 ; AVX512BW-NEXT: vmovdqa64 448(%rsi), %zmm0
8102 ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm7
8103 ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm5
8104 ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
8105 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm22, %zmm1
8106 ; AVX512BW-NEXT: movb $-52, %al
8107 ; AVX512BW-NEXT: kmovd %eax, %k1
8108 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8109 ; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm22
8110 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1}
8111 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8112 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
8113 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1}
8114 ; AVX512BW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8115 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8116 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
8117 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1}
8118 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8119 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1}
8120 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8121 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
8122 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1}
8123 ; AVX512BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8124 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8125 ; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm30 # 64-byte Reload
8126 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1}
8127 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8128 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1}
8129 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8130 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1}
8131 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8132 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1}
8133 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8134 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1}
8135 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8136 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1}
8137 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8138 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1}
8139 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8140 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1}
8141 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8142 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1}
8143 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8144 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1}
8145 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8146 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1}
8147 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8148 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1}
8149 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8150 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1}
8151 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8152 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1}
8153 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8154 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1}
8155 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8156 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1}
8157 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8158 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1}
8159 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8160 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1}
8161 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8162 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1}
8163 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8164 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1}
8165 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8166 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1}
8167 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8168 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1}
8169 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8170 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1}
8171 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8172 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1}
8173 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8174 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1}
8175 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8176 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
8177 ; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm1 {%k1}
8178 ; AVX512BW-NEXT: vmovdqa64 %zmm1, 1984(%r8)
8179 ; AVX512BW-NEXT: vmovdqa64 %zmm2, 1920(%r8)
8180 ; AVX512BW-NEXT: vmovdqa64 %zmm5, 1856(%r8)
8181 ; AVX512BW-NEXT: vmovdqa64 %zmm7, 1792(%r8)
8182 ; AVX512BW-NEXT: vmovdqa64 %zmm9, 1728(%r8)
8183 ; AVX512BW-NEXT: vmovdqa64 %zmm3, 1664(%r8)
8184 ; AVX512BW-NEXT: vmovdqa64 %zmm8, 1600(%r8)
8185 ; AVX512BW-NEXT: vmovdqa64 %zmm10, 1536(%r8)
8186 ; AVX512BW-NEXT: vmovdqa64 %zmm6, 1472(%r8)
8187 ; AVX512BW-NEXT: vmovdqa64 %zmm11, 1408(%r8)
8188 ; AVX512BW-NEXT: vmovdqa64 %zmm12, 1344(%r8)
8189 ; AVX512BW-NEXT: vmovdqa64 %zmm13, 1280(%r8)
8190 ; AVX512BW-NEXT: vmovdqa64 %zmm4, 1216(%r8)
8191 ; AVX512BW-NEXT: vmovdqa64 %zmm14, 1152(%r8)
8192 ; AVX512BW-NEXT: vmovdqa64 %zmm15, 1088(%r8)
8193 ; AVX512BW-NEXT: vmovdqa64 %zmm16, 1024(%r8)
8194 ; AVX512BW-NEXT: vmovdqa64 %zmm19, 960(%r8)
8195 ; AVX512BW-NEXT: vmovdqa64 %zmm17, 896(%r8)
8196 ; AVX512BW-NEXT: vmovdqa64 %zmm21, 832(%r8)
8197 ; AVX512BW-NEXT: vmovdqa64 %zmm23, 768(%r8)
8198 ; AVX512BW-NEXT: vmovdqa64 %zmm18, 704(%r8)
8199 ; AVX512BW-NEXT: vmovdqa64 %zmm24, 640(%r8)
8200 ; AVX512BW-NEXT: vmovdqa64 %zmm25, 576(%r8)
8201 ; AVX512BW-NEXT: vmovdqa64 %zmm27, 512(%r8)
8202 ; AVX512BW-NEXT: vmovdqa64 %zmm20, 448(%r8)
8203 ; AVX512BW-NEXT: vmovdqa64 %zmm29, 384(%r8)
8204 ; AVX512BW-NEXT: vmovdqa64 %zmm30, 320(%r8)
8205 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8206 ; AVX512BW-NEXT: vmovaps %zmm0, 256(%r8)
8207 ; AVX512BW-NEXT: vmovdqa64 %zmm26, 192(%r8)
8208 ; AVX512BW-NEXT: vmovdqa64 %zmm31, 128(%r8)
8209 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8210 ; AVX512BW-NEXT: vmovaps %zmm0, 64(%r8)
8211 ; AVX512BW-NEXT: vmovdqa64 %zmm22, (%r8)
8212 ; AVX512BW-NEXT: addq $2120, %rsp # imm = 0x848
8213 ; AVX512BW-NEXT: vzeroupper
8214 ; AVX512BW-NEXT: retq
8216 ; AVX512BW-FCP-LABEL: store_i64_stride4_vf64:
8217 ; AVX512BW-FCP: # %bb.0:
8218 ; AVX512BW-FCP-NEXT: subq $2120, %rsp # imm = 0x848
8219 ; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdx), %zmm6
8220 ; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdx), %zmm7
8221 ; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdx), %zmm8
8222 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm14
8223 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm12
8224 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdx), %zmm11
8225 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdx), %zmm10
8226 ; AVX512BW-FCP-NEXT: vmovdqa64 384(%rcx), %zmm23
8227 ; AVX512BW-FCP-NEXT: vmovdqa64 320(%rcx), %zmm3
8228 ; AVX512BW-FCP-NEXT: vmovdqa64 256(%rcx), %zmm0
8229 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm4
8230 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm5
8231 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rcx), %zmm2
8232 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rcx), %zmm1
8233 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,8,0,0,1,9]
8234 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm13
8235 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm9, %zmm13
8236 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8237 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,2,10,0,0,3,11]
8238 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm15
8239 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm13, %zmm15
8240 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8241 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,4,12,0,0,5,13]
8242 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm16
8243 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm15, %zmm16
8244 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8245 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,6,14,0,0,7,15]
8246 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm26, %zmm14
8247 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8248 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm4
8249 ; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm9, %zmm12
8250 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8251 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm12
8252 ; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm13, %zmm12
8253 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8254 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm12
8255 ; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm15, %zmm12
8256 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8257 ; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm26, %zmm4
8258 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8259 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm4
8260 ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm9, %zmm4
8261 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8262 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm4
8263 ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm13, %zmm4
8264 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8265 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm4
8266 ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm15, %zmm4
8267 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8268 ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm26, %zmm11
8269 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8270 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm2
8271 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm2
8272 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8273 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm2
8274 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm13, %zmm2
8275 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8276 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm2
8277 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm15, %zmm2
8278 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8279 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm26, %zmm10
8280 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8281 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1
8282 ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm1
8283 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8284 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1
8285 ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm13, %zmm1
8286 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8287 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1
8288 ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm1
8289 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8290 ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm26, %zmm8
8291 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8292 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm0
8293 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm9, %zmm0
8294 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8295 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm0
8296 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm13, %zmm0
8297 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8298 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm0
8299 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm15, %zmm0
8300 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8301 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm26, %zmm7
8302 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8303 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm0
8304 ; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm9, %zmm0
8305 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8306 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm0
8307 ; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm13, %zmm0
8308 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8309 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm0
8310 ; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm15, %zmm0
8311 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8312 ; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm26, %zmm6
8313 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8314 ; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdx), %zmm28
8315 ; AVX512BW-FCP-NEXT: vmovdqa64 448(%rcx), %zmm0
8316 ; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm28, %zmm9
8317 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8318 ; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm28, %zmm13
8319 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8320 ; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm28, %zmm15
8321 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8322 ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm26, %zmm28
8323 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm26
8324 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm3
8325 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,8,0,0,1,9,0,0]
8326 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm0
8327 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm0
8328 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm30
8329 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [2,10,0,0,3,11,0,0]
8330 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm0
8331 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm0
8332 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8333 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [4,12,0,0,5,13,0,0]
8334 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm0
8335 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm0
8336 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8337 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [6,14,0,0,7,15,0,0]
8338 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm22, %zmm26
8339 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm20
8340 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm3
8341 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm0
8342 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm0
8343 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8344 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm0
8345 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm0
8346 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
8347 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm29
8348 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm29
8349 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm22, %zmm20
8350 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm18
8351 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rsi), %zmm3
8352 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm27
8353 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm27
8354 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm25
8355 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm25
8356 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm24
8357 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm24
8358 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm22, %zmm18
8359 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm19
8360 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rsi), %zmm4
8361 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm23
8362 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm7, %zmm23
8363 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm21
8364 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm5, %zmm21
8365 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm17
8366 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm2, %zmm17
8367 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm22, %zmm19
8368 ; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm4
8369 ; AVX512BW-FCP-NEXT: vmovdqa64 256(%rsi), %zmm6
8370 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm16
8371 ; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm7, %zmm16
8372 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm15
8373 ; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm5, %zmm15
8374 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm14
8375 ; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm14
8376 ; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm22, %zmm4
8377 ; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm6
8378 ; AVX512BW-FCP-NEXT: vmovdqa64 320(%rsi), %zmm0
8379 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm13
8380 ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm13
8381 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm12
8382 ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm12
8383 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm11
8384 ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm11
8385 ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm6
8386 ; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm9
8387 ; AVX512BW-FCP-NEXT: vmovdqa64 384(%rsi), %zmm0
8388 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm10
8389 ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm10
8390 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm8
8391 ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm8
8392 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm3
8393 ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm3
8394 ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm9
8395 ; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm1
8396 ; AVX512BW-FCP-NEXT: vmovdqa64 448(%rsi), %zmm0
8397 ; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm7
8398 ; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm5
8399 ; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
8400 ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm1
8401 ; AVX512BW-FCP-NEXT: movb $-52, %al
8402 ; AVX512BW-FCP-NEXT: kmovd %eax, %k1
8403 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8404 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm22
8405 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1}
8406 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8407 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
8408 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1}
8409 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8410 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8411 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
8412 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1}
8413 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8414 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1}
8415 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8416 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
8417 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1}
8418 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8419 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8420 ; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm30 # 64-byte Reload
8421 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1}
8422 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8423 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1}
8424 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8425 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1}
8426 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8427 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1}
8428 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8429 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1}
8430 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8431 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1}
8432 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8433 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1}
8434 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8435 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1}
8436 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8437 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1}
8438 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8439 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1}
8440 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8441 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1}
8442 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8443 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1}
8444 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8445 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1}
8446 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8447 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1}
8448 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8449 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1}
8450 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8451 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1}
8452 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8453 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1}
8454 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8455 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1}
8456 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8457 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1}
8458 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8459 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1}
8460 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8461 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1}
8462 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8463 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1}
8464 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8465 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1}
8466 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8467 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1}
8468 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8469 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1}
8470 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8471 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
8472 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm1 {%k1}
8473 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 1984(%r8)
8474 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 1920(%r8)
8475 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 1856(%r8)
8476 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, 1792(%r8)
8477 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, 1728(%r8)
8478 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 1664(%r8)
8479 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, 1600(%r8)
8480 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, 1536(%r8)
8481 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 1472(%r8)
8482 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, 1408(%r8)
8483 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, 1344(%r8)
8484 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, 1280(%r8)
8485 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 1216(%r8)
8486 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, 1152(%r8)
8487 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, 1088(%r8)
8488 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, 1024(%r8)
8489 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, 960(%r8)
8490 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, 896(%r8)
8491 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, 832(%r8)
8492 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, 768(%r8)
8493 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, 704(%r8)
8494 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, 640(%r8)
8495 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, 576(%r8)
8496 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, 512(%r8)
8497 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, 448(%r8)
8498 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, 384(%r8)
8499 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, 320(%r8)
8500 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8501 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 256(%r8)
8502 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, 192(%r8)
8503 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, 128(%r8)
8504 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8505 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 64(%r8)
8506 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, (%r8)
8507 ; AVX512BW-FCP-NEXT: addq $2120, %rsp # imm = 0x848
8508 ; AVX512BW-FCP-NEXT: vzeroupper
8509 ; AVX512BW-FCP-NEXT: retq
8511 ; AVX512DQ-BW-LABEL: store_i64_stride4_vf64:
8512 ; AVX512DQ-BW: # %bb.0:
8513 ; AVX512DQ-BW-NEXT: subq $2120, %rsp # imm = 0x848
8514 ; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdx), %zmm6
8515 ; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdx), %zmm7
8516 ; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdx), %zmm8
8517 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm14
8518 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm12
8519 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdx), %zmm11
8520 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdx), %zmm10
8521 ; AVX512DQ-BW-NEXT: vmovdqa64 384(%rcx), %zmm23
8522 ; AVX512DQ-BW-NEXT: vmovdqa64 320(%rcx), %zmm3
8523 ; AVX512DQ-BW-NEXT: vmovdqa64 256(%rcx), %zmm0
8524 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm4
8525 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rcx), %zmm5
8526 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rcx), %zmm2
8527 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rcx), %zmm1
8528 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,8,0,0,1,9]
8529 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm13
8530 ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm9, %zmm13
8531 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8532 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,2,10,0,0,3,11]
8533 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm15
8534 ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm13, %zmm15
8535 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8536 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,4,12,0,0,5,13]
8537 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm16
8538 ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm15, %zmm16
8539 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8540 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,6,14,0,0,7,15]
8541 ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm26, %zmm14
8542 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8543 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm4
8544 ; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm9, %zmm12
8545 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8546 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm12
8547 ; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm13, %zmm12
8548 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8549 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm12
8550 ; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm15, %zmm12
8551 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8552 ; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm26, %zmm4
8553 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8554 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm4
8555 ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm9, %zmm4
8556 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8557 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm4
8558 ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm13, %zmm4
8559 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8560 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm4
8561 ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm15, %zmm4
8562 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8563 ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm26, %zmm11
8564 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8565 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm2
8566 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm2
8567 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8568 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm2
8569 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm13, %zmm2
8570 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8571 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm2
8572 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm15, %zmm2
8573 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8574 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm26, %zmm10
8575 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8576 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm1
8577 ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm1
8578 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8579 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm1
8580 ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm13, %zmm1
8581 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8582 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm1
8583 ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm1
8584 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8585 ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm26, %zmm8
8586 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8587 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm0
8588 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm9, %zmm0
8589 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8590 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm0
8591 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm13, %zmm0
8592 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8593 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm0
8594 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm15, %zmm0
8595 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8596 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm26, %zmm7
8597 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8598 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm0
8599 ; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm9, %zmm0
8600 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8601 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm0
8602 ; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm13, %zmm0
8603 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8604 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm0
8605 ; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm15, %zmm0
8606 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8607 ; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm26, %zmm6
8608 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8609 ; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdx), %zmm28
8610 ; AVX512DQ-BW-NEXT: vmovdqa64 448(%rcx), %zmm0
8611 ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm28, %zmm9
8612 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8613 ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm28, %zmm13
8614 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8615 ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm28, %zmm15
8616 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8617 ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm26, %zmm28
8618 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm26
8619 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm3
8620 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,8,0,0,1,9,0,0]
8621 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm0
8622 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm0
8623 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm30
8624 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [2,10,0,0,3,11,0,0]
8625 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm0
8626 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm5, %zmm0
8627 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8628 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [4,12,0,0,5,13,0,0]
8629 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm0
8630 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm0
8631 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8632 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm22 = [6,14,0,0,7,15,0,0]
8633 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm22, %zmm26
8634 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm20
8635 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm3
8636 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm0
8637 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm0
8638 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8639 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm0
8640 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm5, %zmm0
8641 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
8642 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm29
8643 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm29
8644 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm22, %zmm20
8645 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm18
8646 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rsi), %zmm3
8647 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm27
8648 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm27
8649 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm25
8650 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm5, %zmm25
8651 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm24
8652 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm24
8653 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm22, %zmm18
8654 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm19
8655 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rsi), %zmm4
8656 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm23
8657 ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm7, %zmm23
8658 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm21
8659 ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm5, %zmm21
8660 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm17
8661 ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm17
8662 ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm22, %zmm19
8663 ; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm4
8664 ; AVX512DQ-BW-NEXT: vmovdqa64 256(%rsi), %zmm6
8665 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm16
8666 ; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm7, %zmm16
8667 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm15
8668 ; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm5, %zmm15
8669 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm14
8670 ; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm2, %zmm14
8671 ; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm22, %zmm4
8672 ; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm6
8673 ; AVX512DQ-BW-NEXT: vmovdqa64 320(%rsi), %zmm0
8674 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm13
8675 ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm13
8676 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm12
8677 ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm12
8678 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm11
8679 ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm11
8680 ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm22, %zmm6
8681 ; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm9
8682 ; AVX512DQ-BW-NEXT: vmovdqa64 384(%rsi), %zmm0
8683 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm10
8684 ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm10
8685 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm8
8686 ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm8
8687 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm3
8688 ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm3
8689 ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm22, %zmm9
8690 ; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm1
8691 ; AVX512DQ-BW-NEXT: vmovdqa64 448(%rsi), %zmm0
8692 ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm7
8693 ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm5
8694 ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
8695 ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm22, %zmm1
8696 ; AVX512DQ-BW-NEXT: movb $-52, %al
8697 ; AVX512DQ-BW-NEXT: kmovd %eax, %k1
8698 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8699 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm22
8700 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1}
8701 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8702 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
8703 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1}
8704 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8705 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8706 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
8707 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1}
8708 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8709 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1}
8710 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8711 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
8712 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1}
8713 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8714 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8715 ; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm30 # 64-byte Reload
8716 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1}
8717 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8718 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1}
8719 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8720 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1}
8721 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8722 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1}
8723 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8724 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1}
8725 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8726 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1}
8727 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8728 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1}
8729 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8730 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1}
8731 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8732 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1}
8733 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8734 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1}
8735 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8736 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1}
8737 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8738 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1}
8739 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8740 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1}
8741 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8742 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1}
8743 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8744 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1}
8745 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8746 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1}
8747 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8748 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1}
8749 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8750 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1}
8751 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8752 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1}
8753 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8754 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1}
8755 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8756 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1}
8757 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8758 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1}
8759 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8760 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1}
8761 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8762 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1}
8763 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8764 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1}
8765 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8766 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
8767 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm1 {%k1}
8768 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 1984(%r8)
8769 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 1920(%r8)
8770 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 1856(%r8)
8771 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, 1792(%r8)
8772 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, 1728(%r8)
8773 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 1664(%r8)
8774 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, 1600(%r8)
8775 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, 1536(%r8)
8776 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 1472(%r8)
8777 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, 1408(%r8)
8778 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, 1344(%r8)
8779 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, 1280(%r8)
8780 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 1216(%r8)
8781 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, 1152(%r8)
8782 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, 1088(%r8)
8783 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, 1024(%r8)
8784 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, 960(%r8)
8785 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, 896(%r8)
8786 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, 832(%r8)
8787 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, 768(%r8)
8788 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, 704(%r8)
8789 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, 640(%r8)
8790 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, 576(%r8)
8791 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, 512(%r8)
8792 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, 448(%r8)
8793 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, 384(%r8)
8794 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, 320(%r8)
8795 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8796 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 256(%r8)
8797 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, 192(%r8)
8798 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, 128(%r8)
8799 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8800 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 64(%r8)
8801 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, (%r8)
8802 ; AVX512DQ-BW-NEXT: addq $2120, %rsp # imm = 0x848
8803 ; AVX512DQ-BW-NEXT: vzeroupper
8804 ; AVX512DQ-BW-NEXT: retq
8806 ; AVX512DQ-BW-FCP-LABEL: store_i64_stride4_vf64:
8807 ; AVX512DQ-BW-FCP: # %bb.0:
8808 ; AVX512DQ-BW-FCP-NEXT: subq $2120, %rsp # imm = 0x848
8809 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdx), %zmm6
8810 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdx), %zmm7
8811 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdx), %zmm8
8812 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm14
8813 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm12
8814 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdx), %zmm11
8815 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdx), %zmm10
8816 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rcx), %zmm23
8817 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rcx), %zmm3
8818 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rcx), %zmm0
8819 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm4
8820 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm5
8821 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rcx), %zmm2
8822 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rcx), %zmm1
8823 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,8,0,0,1,9]
8824 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm13
8825 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm9, %zmm13
8826 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8827 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,2,10,0,0,3,11]
8828 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm15
8829 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm13, %zmm15
8830 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8831 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,4,12,0,0,5,13]
8832 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm16
8833 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm15, %zmm16
8834 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8835 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,6,14,0,0,7,15]
8836 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm26, %zmm14
8837 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8838 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm4
8839 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm9, %zmm12
8840 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8841 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm12
8842 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm13, %zmm12
8843 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8844 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm12
8845 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm15, %zmm12
8846 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8847 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm26, %zmm4
8848 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8849 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm4
8850 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm9, %zmm4
8851 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8852 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm4
8853 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm13, %zmm4
8854 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8855 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm4
8856 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm15, %zmm4
8857 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8858 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm26, %zmm11
8859 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8860 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm2
8861 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm2
8862 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8863 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm2
8864 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm13, %zmm2
8865 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8866 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm2
8867 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm15, %zmm2
8868 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8869 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm26, %zmm10
8870 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8871 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1
8872 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm1
8873 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8874 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1
8875 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm13, %zmm1
8876 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8877 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1
8878 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm1
8879 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8880 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm26, %zmm8
8881 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8882 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm0
8883 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm9, %zmm0
8884 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8885 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm0
8886 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm13, %zmm0
8887 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8888 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm0
8889 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm15, %zmm0
8890 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8891 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm26, %zmm7
8892 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8893 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm0
8894 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm9, %zmm0
8895 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8896 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm0
8897 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm13, %zmm0
8898 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8899 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm0
8900 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm15, %zmm0
8901 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8902 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm26, %zmm6
8903 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8904 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdx), %zmm28
8905 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rcx), %zmm0
8906 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm28, %zmm9
8907 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8908 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm28, %zmm13
8909 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8910 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm28, %zmm15
8911 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8912 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm26, %zmm28
8913 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm26
8914 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm3
8915 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,8,0,0,1,9,0,0]
8916 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm0
8917 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm0
8918 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm30
8919 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [2,10,0,0,3,11,0,0]
8920 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm0
8921 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm0
8922 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8923 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [4,12,0,0,5,13,0,0]
8924 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm0
8925 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm0
8926 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8927 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [6,14,0,0,7,15,0,0]
8928 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm22, %zmm26
8929 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm20
8930 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm3
8931 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm0
8932 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm0
8933 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8934 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm0
8935 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm0
8936 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
8937 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm29
8938 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm29
8939 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm22, %zmm20
8940 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm18
8941 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rsi), %zmm3
8942 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm27
8943 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm27
8944 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm25
8945 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm25
8946 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm24
8947 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm24
8948 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm22, %zmm18
8949 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm19
8950 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rsi), %zmm4
8951 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm23
8952 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm7, %zmm23
8953 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm21
8954 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm5, %zmm21
8955 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm17
8956 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm2, %zmm17
8957 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm22, %zmm19
8958 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm4
8959 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rsi), %zmm6
8960 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm16
8961 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm7, %zmm16
8962 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm15
8963 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm5, %zmm15
8964 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm14
8965 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm14
8966 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm22, %zmm4
8967 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm6
8968 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rsi), %zmm0
8969 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm13
8970 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm13
8971 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm12
8972 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm12
8973 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm11
8974 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm11
8975 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm6
8976 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm9
8977 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rsi), %zmm0
8978 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm10
8979 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm10
8980 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm8
8981 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm8
8982 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm3
8983 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm3
8984 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm9
8985 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm1
8986 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rsi), %zmm0
8987 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm7
8988 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm5
8989 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
8990 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm1
8991 ; AVX512DQ-BW-FCP-NEXT: movb $-52, %al
8992 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1
8993 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8994 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm22
8995 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1}
8996 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8997 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
8998 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1}
8999 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9000 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9001 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
9002 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1}
9003 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9004 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1}
9005 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9006 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
9007 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1}
9008 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9009 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9010 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm30 # 64-byte Reload
9011 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1}
9012 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9013 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1}
9014 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9015 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1}
9016 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9017 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1}
9018 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9019 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1}
9020 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9021 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1}
9022 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9023 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1}
9024 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9025 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1}
9026 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9027 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1}
9028 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9029 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1}
9030 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9031 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1}
9032 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9033 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1}
9034 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9035 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1}
9036 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9037 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1}
9038 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9039 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1}
9040 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9041 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1}
9042 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9043 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1}
9044 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9045 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1}
9046 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9047 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1}
9048 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9049 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1}
9050 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9051 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1}
9052 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9053 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1}
9054 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9055 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1}
9056 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9057 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1}
9058 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9059 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1}
9060 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9061 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
9062 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm1 {%k1}
9063 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 1984(%r8)
9064 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 1920(%r8)
9065 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 1856(%r8)
9066 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, 1792(%r8)
9067 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, 1728(%r8)
9068 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 1664(%r8)
9069 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, 1600(%r8)
9070 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, 1536(%r8)
9071 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 1472(%r8)
9072 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, 1408(%r8)
9073 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, 1344(%r8)
9074 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, 1280(%r8)
9075 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 1216(%r8)
9076 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, 1152(%r8)
9077 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, 1088(%r8)
9078 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, 1024(%r8)
9079 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, 960(%r8)
9080 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, 896(%r8)
9081 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, 832(%r8)
9082 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, 768(%r8)
9083 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, 704(%r8)
9084 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, 640(%r8)
9085 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, 576(%r8)
9086 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, 512(%r8)
9087 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, 448(%r8)
9088 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, 384(%r8)
9089 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, 320(%r8)
9090 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9091 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 256(%r8)
9092 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, 192(%r8)
9093 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, 128(%r8)
9094 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9095 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 64(%r8)
9096 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, (%r8)
9097 ; AVX512DQ-BW-FCP-NEXT: addq $2120, %rsp # imm = 0x848
9098 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
9099 ; AVX512DQ-BW-FCP-NEXT: retq
9100 %in.vec0 = load <64 x i64>, ptr %in.vecptr0, align 64
9101 %in.vec1 = load <64 x i64>, ptr %in.vecptr1, align 64
9102 %in.vec2 = load <64 x i64>, ptr %in.vecptr2, align 64
9103 %in.vec3 = load <64 x i64>, ptr %in.vecptr3, align 64
9104 %1 = shufflevector <64 x i64> %in.vec0, <64 x i64> %in.vec1, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
9105 %2 = shufflevector <64 x i64> %in.vec2, <64 x i64> %in.vec3, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
9106 %3 = shufflevector <128 x i64> %1, <128 x i64> %2, <256 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127, i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 135, i32 136, i32 137, i32 138, i32 139, i32 140, i32 141, i32 142, i32 143, i32 144, i32 145, i32 146, i32 147, i32 148, i32 149, i32 150, i32 151, i32 152, i32 153, i32 154, i32 155, i32 156, i32 157, i32 158, i32 159, i32 160, i32 161, i32 162, i32 163, i32 164, i32 165, i32 166, i32 167, i32 168, i32 169, i32 170, i32 171, i32 172, i32 173, i32 174, i32 175, i32 176, i32 177, i32 178, i32 179, i32 180, i32 181, i32 182, i32 183, i32 184, i32 185, i32 186, i32 187, i32 188, i32 189, i32 190, i32 191, i32 192, i32 193, i32 194, i32 195, i32 196, i32 197, i32 198, i32 199, i32 200, i32 201, i32 202, i32 203, i32 204, i32 205, i32 206, i32 207, i32 208, i32 209, i32 210, i32 211, i32 212, i32 213, i32 214, i32 215, i32 216, i32 217, i32 218, i32 219, i32 220, i32 221, i32 222, i32 223, i32 224, i32 225, i32 
226, i32 227, i32 228, i32 229, i32 230, i32 231, i32 232, i32 233, i32 234, i32 235, i32 236, i32 237, i32 238, i32 239, i32 240, i32 241, i32 242, i32 243, i32 244, i32 245, i32 246, i32 247, i32 248, i32 249, i32 250, i32 251, i32 252, i32 253, i32 254, i32 255>
9107 %interleaved.vec = shufflevector <256 x i64> %3, <256 x i64> poison, <256 x i32> <i32 0, i32 64, i32 128, i32 192, i32 1, i32 65, i32 129, i32 193, i32 2, i32 66, i32 130, i32 194, i32 3, i32 67, i32 131, i32 195, i32 4, i32 68, i32 132, i32 196, i32 5, i32 69, i32 133, i32 197, i32 6, i32 70, i32 134, i32 198, i32 7, i32 71, i32 135, i32 199, i32 8, i32 72, i32 136, i32 200, i32 9, i32 73, i32 137, i32 201, i32 10, i32 74, i32 138, i32 202, i32 11, i32 75, i32 139, i32 203, i32 12, i32 76, i32 140, i32 204, i32 13, i32 77, i32 141, i32 205, i32 14, i32 78, i32 142, i32 206, i32 15, i32 79, i32 143, i32 207, i32 16, i32 80, i32 144, i32 208, i32 17, i32 81, i32 145, i32 209, i32 18, i32 82, i32 146, i32 210, i32 19, i32 83, i32 147, i32 211, i32 20, i32 84, i32 148, i32 212, i32 21, i32 85, i32 149, i32 213, i32 22, i32 86, i32 150, i32 214, i32 23, i32 87, i32 151, i32 215, i32 24, i32 88, i32 152, i32 216, i32 25, i32 89, i32 153, i32 217, i32 26, i32 90, i32 154, i32 218, i32 27, i32 91, i32 155, i32 219, i32 28, i32 92, i32 156, i32 220, i32 29, i32 93, i32 157, i32 221, i32 30, i32 94, i32 158, i32 222, i32 31, i32 95, i32 159, i32 223, i32 32, i32 96, i32 160, i32 224, i32 33, i32 97, i32 161, i32 225, i32 34, i32 98, i32 162, i32 226, i32 35, i32 99, i32 163, i32 227, i32 36, i32 100, i32 164, i32 228, i32 37, i32 101, i32 165, i32 229, i32 38, i32 102, i32 166, i32 230, i32 39, i32 103, i32 167, i32 231, i32 40, i32 104, i32 168, i32 232, i32 41, i32 105, i32 169, i32 233, i32 42, i32 106, i32 170, i32 234, i32 43, i32 107, i32 171, i32 235, i32 44, i32 108, i32 172, i32 236, i32 45, i32 109, i32 173, i32 237, i32 46, i32 110, i32 174, i32 238, i32 47, i32 111, i32 175, i32 239, i32 48, i32 112, i32 176, i32 240, i32 49, i32 113, i32 177, i32 241, i32 50, i32 114, i32 178, i32 242, i32 51, i32 115, i32 179, i32 243, i32 52, i32 116, i32 180, i32 244, i32 53, i32 117, i32 181, i32 245, i32 54, i32 118, i32 182, i32 246, i32 55, i32 119, i32 183, i32 
247, i32 56, i32 120, i32 184, i32 248, i32 57, i32 121, i32 185, i32 249, i32 58, i32 122, i32 186, i32 250, i32 59, i32 123, i32 187, i32 251, i32 60, i32 124, i32 188, i32 252, i32 61, i32 125, i32 189, i32 253, i32 62, i32 126, i32 190, i32 254, i32 63, i32 127, i32 191, i32 255>
9108 store <256 x i64> %interleaved.vec, ptr %out.vec, align 64