1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
3 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX
4 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
5 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FP
6 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FCP
7 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512
8 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512-FCP
9 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512DQ
10 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-FCP
11 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW
12 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW-FCP
13 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512DQ-BW
14 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-BW-FCP
16 ; These patterns are produced by LoopVectorizer for interleaved stores.
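;
; For reference, a stride-2 interleaved store is what the vectorizer forms for
; a scalar loop of the shape below (an illustrative sketch only; the names
; n, a, b and out are made up and this comment is not part of the generated
; checks):
;
;   for (i = 0; i < n; ++i) {
;     out[2*i + 0] = a[i];
;     out[2*i + 1] = b[i];
;   }
;
; Each function below loads the two source vectors, interleaves them with a
; shufflevector, and stores the result, at increasing vector factors (vf).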
18 define void @store_i64_stride2_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.vec) nounwind {
19 ; SSE-LABEL: store_i64_stride2_vf2:
21 ; SSE-NEXT: movaps (%rdi), %xmm0
22 ; SSE-NEXT: movaps (%rsi), %xmm1
23 ; SSE-NEXT: movaps %xmm0, %xmm2
24 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
25 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
26 ; SSE-NEXT: movaps %xmm0, 16(%rdx)
27 ; SSE-NEXT: movaps %xmm2, (%rdx)
30 ; AVX-LABEL: store_i64_stride2_vf2:
32 ; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
33 ; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1]
34 ; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[3]
35 ; AVX-NEXT: vmovapd %ymm0, (%rdx)
36 ; AVX-NEXT: vzeroupper
39 ; AVX2-LABEL: store_i64_stride2_vf2:
41 ; AVX2-NEXT: vmovaps (%rdi), %xmm0
42 ; AVX2-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0
43 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
44 ; AVX2-NEXT: vmovaps %ymm0, (%rdx)
45 ; AVX2-NEXT: vzeroupper
48 ; AVX2-FP-LABEL: store_i64_stride2_vf2:
50 ; AVX2-FP-NEXT: vmovaps (%rdi), %xmm0
51 ; AVX2-FP-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0
52 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
53 ; AVX2-FP-NEXT: vmovaps %ymm0, (%rdx)
54 ; AVX2-FP-NEXT: vzeroupper
57 ; AVX2-FCP-LABEL: store_i64_stride2_vf2:
59 ; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm0
60 ; AVX2-FCP-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0
61 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
62 ; AVX2-FCP-NEXT: vmovaps %ymm0, (%rdx)
63 ; AVX2-FCP-NEXT: vzeroupper
66 ; AVX512-LABEL: store_i64_stride2_vf2:
68 ; AVX512-NEXT: vmovaps (%rdi), %xmm0
69 ; AVX512-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0
70 ; AVX512-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
71 ; AVX512-NEXT: vmovaps %ymm0, (%rdx)
72 ; AVX512-NEXT: vzeroupper
75 ; AVX512-FCP-LABEL: store_i64_stride2_vf2:
76 ; AVX512-FCP: # %bb.0:
77 ; AVX512-FCP-NEXT: vmovaps (%rdi), %xmm0
78 ; AVX512-FCP-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0
79 ; AVX512-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
80 ; AVX512-FCP-NEXT: vmovaps %ymm0, (%rdx)
81 ; AVX512-FCP-NEXT: vzeroupper
82 ; AVX512-FCP-NEXT: retq
84 ; AVX512DQ-LABEL: store_i64_stride2_vf2:
86 ; AVX512DQ-NEXT: vmovaps (%rdi), %xmm0
87 ; AVX512DQ-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0
88 ; AVX512DQ-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
89 ; AVX512DQ-NEXT: vmovaps %ymm0, (%rdx)
90 ; AVX512DQ-NEXT: vzeroupper
93 ; AVX512DQ-FCP-LABEL: store_i64_stride2_vf2:
94 ; AVX512DQ-FCP: # %bb.0:
95 ; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %xmm0
96 ; AVX512DQ-FCP-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0
97 ; AVX512DQ-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
98 ; AVX512DQ-FCP-NEXT: vmovaps %ymm0, (%rdx)
99 ; AVX512DQ-FCP-NEXT: vzeroupper
100 ; AVX512DQ-FCP-NEXT: retq
102 ; AVX512BW-LABEL: store_i64_stride2_vf2:
104 ; AVX512BW-NEXT: vmovaps (%rdi), %xmm0
105 ; AVX512BW-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0
106 ; AVX512BW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
107 ; AVX512BW-NEXT: vmovaps %ymm0, (%rdx)
108 ; AVX512BW-NEXT: vzeroupper
109 ; AVX512BW-NEXT: retq
111 ; AVX512BW-FCP-LABEL: store_i64_stride2_vf2:
112 ; AVX512BW-FCP: # %bb.0:
113 ; AVX512BW-FCP-NEXT: vmovaps (%rdi), %xmm0
114 ; AVX512BW-FCP-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0
115 ; AVX512BW-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
116 ; AVX512BW-FCP-NEXT: vmovaps %ymm0, (%rdx)
117 ; AVX512BW-FCP-NEXT: vzeroupper
118 ; AVX512BW-FCP-NEXT: retq
120 ; AVX512DQ-BW-LABEL: store_i64_stride2_vf2:
121 ; AVX512DQ-BW: # %bb.0:
122 ; AVX512DQ-BW-NEXT: vmovaps (%rdi), %xmm0
123 ; AVX512DQ-BW-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0
124 ; AVX512DQ-BW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
125 ; AVX512DQ-BW-NEXT: vmovaps %ymm0, (%rdx)
126 ; AVX512DQ-BW-NEXT: vzeroupper
127 ; AVX512DQ-BW-NEXT: retq
129 ; AVX512DQ-BW-FCP-LABEL: store_i64_stride2_vf2:
130 ; AVX512DQ-BW-FCP: # %bb.0:
131 ; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %xmm0
132 ; AVX512DQ-BW-FCP-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0
133 ; AVX512DQ-BW-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
134 ; AVX512DQ-BW-FCP-NEXT: vmovaps %ymm0, (%rdx)
135 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
136 ; AVX512DQ-BW-FCP-NEXT: retq
137 %in.vec0 = load <2 x i64>, ptr %in.vecptr0, align 64
138 %in.vec1 = load <2 x i64>, ptr %in.vecptr1, align 64
139 %1 = shufflevector <2 x i64> %in.vec0, <2 x i64> %in.vec1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
140 %interleaved.vec = shufflevector <4 x i64> %1, <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
141 store <4 x i64> %interleaved.vec, ptr %out.vec, align 64
  ret void
}
145 define void @store_i64_stride2_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.vec) nounwind {
146 ; SSE-LABEL: store_i64_stride2_vf4:
148 ; SSE-NEXT: movaps (%rdi), %xmm0
149 ; SSE-NEXT: movaps 16(%rdi), %xmm1
150 ; SSE-NEXT: movaps (%rsi), %xmm2
151 ; SSE-NEXT: movaps 16(%rsi), %xmm3
152 ; SSE-NEXT: movaps %xmm0, %xmm4
153 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1]
154 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
155 ; SSE-NEXT: movaps %xmm1, %xmm2
156 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1]
157 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0]
158 ; SSE-NEXT: movaps %xmm1, 32(%rdx)
159 ; SSE-NEXT: movaps %xmm2, 48(%rdx)
160 ; SSE-NEXT: movaps %xmm0, (%rdx)
161 ; SSE-NEXT: movaps %xmm4, 16(%rdx)
164 ; AVX-LABEL: store_i64_stride2_vf4:
166 ; AVX-NEXT: vmovaps (%rsi), %xmm0
167 ; AVX-NEXT: vmovaps (%rdi), %xmm1
168 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm1[1],xmm0[1]
169 ; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
170 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
171 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3,2,3]
172 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = mem[2,3,2,3]
173 ; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[3],ymm1[3]
174 ; AVX-NEXT: vmovapd %ymm1, 32(%rdx)
175 ; AVX-NEXT: vmovapd %ymm0, (%rdx)
176 ; AVX-NEXT: vzeroupper
179 ; AVX2-LABEL: store_i64_stride2_vf4:
181 ; AVX2-NEXT: vmovaps (%rdi), %ymm0
182 ; AVX2-NEXT: vmovaps (%rsi), %ymm1
183 ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm1[0,0,2,1]
184 ; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[0,1,1,3]
185 ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
186 ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
187 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3]
188 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
189 ; AVX2-NEXT: vmovaps %ymm0, 32(%rdx)
190 ; AVX2-NEXT: vmovaps %ymm2, (%rdx)
191 ; AVX2-NEXT: vzeroupper
194 ; AVX2-FP-LABEL: store_i64_stride2_vf4:
196 ; AVX2-FP-NEXT: vmovaps (%rdi), %ymm0
197 ; AVX2-FP-NEXT: vmovaps (%rsi), %ymm1
198 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm1[0,0,2,1]
199 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[0,1,1,3]
200 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
201 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
202 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3]
203 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
204 ; AVX2-FP-NEXT: vmovaps %ymm0, 32(%rdx)
205 ; AVX2-FP-NEXT: vmovaps %ymm2, (%rdx)
206 ; AVX2-FP-NEXT: vzeroupper
209 ; AVX2-FCP-LABEL: store_i64_stride2_vf4:
211 ; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm0
212 ; AVX2-FCP-NEXT: vmovaps (%rsi), %ymm1
213 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm1[0,0,2,1]
214 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[0,1,1,3]
215 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
216 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
217 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3]
218 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
219 ; AVX2-FCP-NEXT: vmovaps %ymm0, 32(%rdx)
220 ; AVX2-FCP-NEXT: vmovaps %ymm2, (%rdx)
221 ; AVX2-FCP-NEXT: vzeroupper
222 ; AVX2-FCP-NEXT: retq
224 ; AVX512-LABEL: store_i64_stride2_vf4:
226 ; AVX512-NEXT: vmovaps (%rdi), %ymm0
227 ; AVX512-NEXT: vinsertf64x4 $1, (%rsi), %zmm0, %zmm0
228 ; AVX512-NEXT: vmovaps {{.*#+}} zmm1 = [0,4,1,5,2,6,3,7]
229 ; AVX512-NEXT: vpermpd %zmm0, %zmm1, %zmm0
230 ; AVX512-NEXT: vmovaps %zmm0, (%rdx)
231 ; AVX512-NEXT: vzeroupper
234 ; AVX512-FCP-LABEL: store_i64_stride2_vf4:
235 ; AVX512-FCP: # %bb.0:
236 ; AVX512-FCP-NEXT: vmovaps (%rdi), %ymm0
237 ; AVX512-FCP-NEXT: vinsertf64x4 $1, (%rsi), %zmm0, %zmm0
238 ; AVX512-FCP-NEXT: vmovaps {{.*#+}} zmm1 = [0,4,1,5,2,6,3,7]
239 ; AVX512-FCP-NEXT: vpermpd %zmm0, %zmm1, %zmm0
240 ; AVX512-FCP-NEXT: vmovaps %zmm0, (%rdx)
241 ; AVX512-FCP-NEXT: vzeroupper
242 ; AVX512-FCP-NEXT: retq
244 ; AVX512DQ-LABEL: store_i64_stride2_vf4:
246 ; AVX512DQ-NEXT: vmovaps (%rdi), %ymm0
247 ; AVX512DQ-NEXT: vinsertf64x4 $1, (%rsi), %zmm0, %zmm0
248 ; AVX512DQ-NEXT: vmovaps {{.*#+}} zmm1 = [0,4,1,5,2,6,3,7]
249 ; AVX512DQ-NEXT: vpermpd %zmm0, %zmm1, %zmm0
250 ; AVX512DQ-NEXT: vmovaps %zmm0, (%rdx)
251 ; AVX512DQ-NEXT: vzeroupper
252 ; AVX512DQ-NEXT: retq
254 ; AVX512DQ-FCP-LABEL: store_i64_stride2_vf4:
255 ; AVX512DQ-FCP: # %bb.0:
256 ; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %ymm0
257 ; AVX512DQ-FCP-NEXT: vinsertf64x4 $1, (%rsi), %zmm0, %zmm0
258 ; AVX512DQ-FCP-NEXT: vmovaps {{.*#+}} zmm1 = [0,4,1,5,2,6,3,7]
259 ; AVX512DQ-FCP-NEXT: vpermpd %zmm0, %zmm1, %zmm0
260 ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, (%rdx)
261 ; AVX512DQ-FCP-NEXT: vzeroupper
262 ; AVX512DQ-FCP-NEXT: retq
264 ; AVX512BW-LABEL: store_i64_stride2_vf4:
266 ; AVX512BW-NEXT: vmovaps (%rdi), %ymm0
267 ; AVX512BW-NEXT: vinsertf64x4 $1, (%rsi), %zmm0, %zmm0
268 ; AVX512BW-NEXT: vmovaps {{.*#+}} zmm1 = [0,4,1,5,2,6,3,7]
269 ; AVX512BW-NEXT: vpermpd %zmm0, %zmm1, %zmm0
270 ; AVX512BW-NEXT: vmovaps %zmm0, (%rdx)
271 ; AVX512BW-NEXT: vzeroupper
272 ; AVX512BW-NEXT: retq
274 ; AVX512BW-FCP-LABEL: store_i64_stride2_vf4:
275 ; AVX512BW-FCP: # %bb.0:
276 ; AVX512BW-FCP-NEXT: vmovaps (%rdi), %ymm0
277 ; AVX512BW-FCP-NEXT: vinsertf64x4 $1, (%rsi), %zmm0, %zmm0
278 ; AVX512BW-FCP-NEXT: vmovaps {{.*#+}} zmm1 = [0,4,1,5,2,6,3,7]
279 ; AVX512BW-FCP-NEXT: vpermpd %zmm0, %zmm1, %zmm0
280 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, (%rdx)
281 ; AVX512BW-FCP-NEXT: vzeroupper
282 ; AVX512BW-FCP-NEXT: retq
284 ; AVX512DQ-BW-LABEL: store_i64_stride2_vf4:
285 ; AVX512DQ-BW: # %bb.0:
286 ; AVX512DQ-BW-NEXT: vmovaps (%rdi), %ymm0
287 ; AVX512DQ-BW-NEXT: vinsertf64x4 $1, (%rsi), %zmm0, %zmm0
288 ; AVX512DQ-BW-NEXT: vmovaps {{.*#+}} zmm1 = [0,4,1,5,2,6,3,7]
289 ; AVX512DQ-BW-NEXT: vpermpd %zmm0, %zmm1, %zmm0
290 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, (%rdx)
291 ; AVX512DQ-BW-NEXT: vzeroupper
292 ; AVX512DQ-BW-NEXT: retq
294 ; AVX512DQ-BW-FCP-LABEL: store_i64_stride2_vf4:
295 ; AVX512DQ-BW-FCP: # %bb.0:
296 ; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %ymm0
297 ; AVX512DQ-BW-FCP-NEXT: vinsertf64x4 $1, (%rsi), %zmm0, %zmm0
298 ; AVX512DQ-BW-FCP-NEXT: vmovaps {{.*#+}} zmm1 = [0,4,1,5,2,6,3,7]
299 ; AVX512DQ-BW-FCP-NEXT: vpermpd %zmm0, %zmm1, %zmm0
300 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, (%rdx)
301 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
302 ; AVX512DQ-BW-FCP-NEXT: retq
303 %in.vec0 = load <4 x i64>, ptr %in.vecptr0, align 64
304 %in.vec1 = load <4 x i64>, ptr %in.vecptr1, align 64
305 %1 = shufflevector <4 x i64> %in.vec0, <4 x i64> %in.vec1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
306 %interleaved.vec = shufflevector <8 x i64> %1, <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
307 store <8 x i64> %interleaved.vec, ptr %out.vec, align 64
  ret void
}
311 define void @store_i64_stride2_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.vec) nounwind {
312 ; SSE-LABEL: store_i64_stride2_vf8:
314 ; SSE-NEXT: movaps (%rdi), %xmm0
315 ; SSE-NEXT: movaps 16(%rdi), %xmm1
316 ; SSE-NEXT: movaps 32(%rdi), %xmm2
317 ; SSE-NEXT: movaps 48(%rdi), %xmm3
318 ; SSE-NEXT: movaps (%rsi), %xmm4
319 ; SSE-NEXT: movaps 16(%rsi), %xmm5
320 ; SSE-NEXT: movaps 32(%rsi), %xmm6
321 ; SSE-NEXT: movaps 48(%rsi), %xmm7
322 ; SSE-NEXT: movaps %xmm0, %xmm8
323 ; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm4[1]
324 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0]
325 ; SSE-NEXT: movaps %xmm1, %xmm4
326 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm5[1]
327 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0]
328 ; SSE-NEXT: movaps %xmm2, %xmm5
329 ; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm6[1]
330 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm6[0]
331 ; SSE-NEXT: movaps %xmm3, %xmm6
332 ; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm7[1]
333 ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm7[0]
334 ; SSE-NEXT: movaps %xmm3, 96(%rdx)
335 ; SSE-NEXT: movaps %xmm6, 112(%rdx)
336 ; SSE-NEXT: movaps %xmm2, 64(%rdx)
337 ; SSE-NEXT: movaps %xmm5, 80(%rdx)
338 ; SSE-NEXT: movaps %xmm1, 32(%rdx)
339 ; SSE-NEXT: movaps %xmm4, 48(%rdx)
340 ; SSE-NEXT: movaps %xmm0, (%rdx)
341 ; SSE-NEXT: movaps %xmm8, 16(%rdx)
344 ; AVX-LABEL: store_i64_stride2_vf8:
346 ; AVX-NEXT: vmovaps (%rsi), %xmm0
347 ; AVX-NEXT: vmovaps 32(%rsi), %xmm1
348 ; AVX-NEXT: vmovaps (%rdi), %xmm2
349 ; AVX-NEXT: vmovaps 32(%rdi), %xmm3
350 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm2[1],xmm0[1]
351 ; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm2[0],xmm0[0]
352 ; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
353 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm1[1]
354 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm1[0]
355 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
356 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = mem[2,3,2,3]
357 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = mem[2,3,2,3]
358 ; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[3],ymm2[3]
359 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = mem[2,3,2,3]
360 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm4 = mem[2,3,2,3]
361 ; AVX-NEXT: vshufpd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[3],ymm3[3]
362 ; AVX-NEXT: vmovapd %ymm3, 96(%rdx)
363 ; AVX-NEXT: vmovapd %ymm2, 32(%rdx)
364 ; AVX-NEXT: vmovaps %ymm1, 64(%rdx)
365 ; AVX-NEXT: vmovapd %ymm0, (%rdx)
366 ; AVX-NEXT: vzeroupper
369 ; AVX2-LABEL: store_i64_stride2_vf8:
371 ; AVX2-NEXT: vmovaps (%rdi), %ymm0
372 ; AVX2-NEXT: vmovaps 32(%rdi), %ymm1
373 ; AVX2-NEXT: vmovaps (%rsi), %ymm2
374 ; AVX2-NEXT: vmovaps 32(%rsi), %ymm3
375 ; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm2[0,2,2,3]
376 ; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm0[2,1,3,3]
377 ; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7]
378 ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1]
379 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
380 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
381 ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm3[0,2,2,3]
382 ; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm1[2,1,3,3]
383 ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7]
384 ; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,0,2,1]
385 ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,3]
386 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5],ymm3[6,7]
387 ; AVX2-NEXT: vmovaps %ymm1, 64(%rdx)
388 ; AVX2-NEXT: vmovaps %ymm2, 96(%rdx)
389 ; AVX2-NEXT: vmovaps %ymm0, (%rdx)
390 ; AVX2-NEXT: vmovaps %ymm4, 32(%rdx)
391 ; AVX2-NEXT: vzeroupper
394 ; AVX2-FP-LABEL: store_i64_stride2_vf8:
396 ; AVX2-FP-NEXT: vmovaps (%rdi), %ymm0
397 ; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm1
398 ; AVX2-FP-NEXT: vmovaps (%rsi), %ymm2
399 ; AVX2-FP-NEXT: vmovaps 32(%rsi), %ymm3
400 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm4 = ymm2[0,2,2,3]
401 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm5 = ymm0[2,1,3,3]
402 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7]
403 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1]
404 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
405 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
406 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm3[0,2,2,3]
407 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm5 = ymm1[2,1,3,3]
408 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7]
409 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,0,2,1]
410 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,3]
411 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5],ymm3[6,7]
412 ; AVX2-FP-NEXT: vmovaps %ymm1, 64(%rdx)
413 ; AVX2-FP-NEXT: vmovaps %ymm2, 96(%rdx)
414 ; AVX2-FP-NEXT: vmovaps %ymm0, (%rdx)
415 ; AVX2-FP-NEXT: vmovaps %ymm4, 32(%rdx)
416 ; AVX2-FP-NEXT: vzeroupper
419 ; AVX2-FCP-LABEL: store_i64_stride2_vf8:
421 ; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm0
422 ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm1
423 ; AVX2-FCP-NEXT: vmovaps (%rsi), %ymm2
424 ; AVX2-FCP-NEXT: vmovaps 32(%rsi), %ymm3
425 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm4 = ymm2[0,2,2,3]
426 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm5 = ymm0[2,1,3,3]
427 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7]
428 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1]
429 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
430 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
431 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm3[0,2,2,3]
432 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm5 = ymm1[2,1,3,3]
433 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7]
434 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,0,2,1]
435 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,3]
436 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5],ymm3[6,7]
437 ; AVX2-FCP-NEXT: vmovaps %ymm1, 64(%rdx)
438 ; AVX2-FCP-NEXT: vmovaps %ymm2, 96(%rdx)
439 ; AVX2-FCP-NEXT: vmovaps %ymm0, (%rdx)
440 ; AVX2-FCP-NEXT: vmovaps %ymm4, 32(%rdx)
441 ; AVX2-FCP-NEXT: vzeroupper
442 ; AVX2-FCP-NEXT: retq
444 ; AVX512-LABEL: store_i64_stride2_vf8:
446 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
447 ; AVX512-NEXT: vmovdqa64 (%rsi), %zmm1
448 ; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,8,1,9,2,10,3,11]
449 ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
450 ; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [4,12,5,13,6,14,7,15]
451 ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm3
452 ; AVX512-NEXT: vmovdqa64 %zmm3, 64(%rdx)
453 ; AVX512-NEXT: vmovdqa64 %zmm2, (%rdx)
454 ; AVX512-NEXT: vzeroupper
457 ; AVX512-FCP-LABEL: store_i64_stride2_vf8:
458 ; AVX512-FCP: # %bb.0:
459 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
460 ; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm1
461 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,8,1,9,2,10,3,11]
462 ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
463 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [4,12,5,13,6,14,7,15]
464 ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3
465 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 64(%rdx)
466 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, (%rdx)
467 ; AVX512-FCP-NEXT: vzeroupper
468 ; AVX512-FCP-NEXT: retq
470 ; AVX512DQ-LABEL: store_i64_stride2_vf8:
472 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0
473 ; AVX512DQ-NEXT: vmovdqa64 (%rsi), %zmm1
474 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,8,1,9,2,10,3,11]
475 ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
476 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm3 = [4,12,5,13,6,14,7,15]
477 ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm3
478 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, 64(%rdx)
479 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, (%rdx)
480 ; AVX512DQ-NEXT: vzeroupper
481 ; AVX512DQ-NEXT: retq
483 ; AVX512DQ-FCP-LABEL: store_i64_stride2_vf8:
484 ; AVX512DQ-FCP: # %bb.0:
485 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
486 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm1
487 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,8,1,9,2,10,3,11]
488 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
489 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [4,12,5,13,6,14,7,15]
490 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3
491 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 64(%rdx)
492 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%rdx)
493 ; AVX512DQ-FCP-NEXT: vzeroupper
494 ; AVX512DQ-FCP-NEXT: retq
496 ; AVX512BW-LABEL: store_i64_stride2_vf8:
498 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
499 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm1
500 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,8,1,9,2,10,3,11]
501 ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
502 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [4,12,5,13,6,14,7,15]
503 ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm3
504 ; AVX512BW-NEXT: vmovdqa64 %zmm3, 64(%rdx)
505 ; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rdx)
506 ; AVX512BW-NEXT: vzeroupper
507 ; AVX512BW-NEXT: retq
509 ; AVX512BW-FCP-LABEL: store_i64_stride2_vf8:
510 ; AVX512BW-FCP: # %bb.0:
511 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
512 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm1
513 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,8,1,9,2,10,3,11]
514 ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
515 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [4,12,5,13,6,14,7,15]
516 ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3
517 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 64(%rdx)
518 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%rdx)
519 ; AVX512BW-FCP-NEXT: vzeroupper
520 ; AVX512BW-FCP-NEXT: retq
522 ; AVX512DQ-BW-LABEL: store_i64_stride2_vf8:
523 ; AVX512DQ-BW: # %bb.0:
524 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0
525 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm1
526 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,8,1,9,2,10,3,11]
527 ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
528 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [4,12,5,13,6,14,7,15]
529 ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm3
530 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 64(%rdx)
531 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%rdx)
532 ; AVX512DQ-BW-NEXT: vzeroupper
533 ; AVX512DQ-BW-NEXT: retq
535 ; AVX512DQ-BW-FCP-LABEL: store_i64_stride2_vf8:
536 ; AVX512DQ-BW-FCP: # %bb.0:
537 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
538 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm1
539 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,8,1,9,2,10,3,11]
540 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
541 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [4,12,5,13,6,14,7,15]
542 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3
543 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 64(%rdx)
544 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%rdx)
545 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
546 ; AVX512DQ-BW-FCP-NEXT: retq
547 %in.vec0 = load <8 x i64>, ptr %in.vecptr0, align 64
548 %in.vec1 = load <8 x i64>, ptr %in.vecptr1, align 64
549 %1 = shufflevector <8 x i64> %in.vec0, <8 x i64> %in.vec1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
550 %interleaved.vec = shufflevector <16 x i64> %1, <16 x i64> poison, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
551 store <16 x i64> %interleaved.vec, ptr %out.vec, align 64
  ret void
}
555 define void @store_i64_stride2_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.vec) nounwind {
556 ; SSE-LABEL: store_i64_stride2_vf16:
558 ; SSE-NEXT: movaps 112(%rdi), %xmm0
559 ; SSE-NEXT: movaps 96(%rdi), %xmm6
560 ; SSE-NEXT: movaps 80(%rdi), %xmm4
561 ; SSE-NEXT: movaps 64(%rdi), %xmm3
562 ; SSE-NEXT: movaps (%rdi), %xmm8
563 ; SSE-NEXT: movaps 16(%rdi), %xmm1
564 ; SSE-NEXT: movaps 32(%rdi), %xmm2
565 ; SSE-NEXT: movaps 48(%rdi), %xmm5
566 ; SSE-NEXT: movaps 96(%rsi), %xmm11
567 ; SSE-NEXT: movaps 80(%rsi), %xmm12
568 ; SSE-NEXT: movaps 64(%rsi), %xmm13
569 ; SSE-NEXT: movaps (%rsi), %xmm9
570 ; SSE-NEXT: movaps 16(%rsi), %xmm10
571 ; SSE-NEXT: movaps 32(%rsi), %xmm14
572 ; SSE-NEXT: movaps 48(%rsi), %xmm15
573 ; SSE-NEXT: movaps %xmm8, %xmm7
574 ; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm9[1]
575 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
576 ; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm9[0]
577 ; SSE-NEXT: movaps %xmm1, %xmm9
578 ; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm10[1]
579 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm10[0]
580 ; SSE-NEXT: movaps %xmm2, %xmm10
581 ; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm14[1]
582 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm14[0]
583 ; SSE-NEXT: movaps %xmm5, %xmm14
584 ; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm15[1]
585 ; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm15[0]
586 ; SSE-NEXT: movaps %xmm3, %xmm15
587 ; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm13[1]
588 ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm13[0]
589 ; SSE-NEXT: movaps %xmm4, %xmm13
590 ; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm12[1]
591 ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm12[0]
592 ; SSE-NEXT: movaps %xmm6, %xmm12
593 ; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm11[1]
594 ; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm11[0]
595 ; SSE-NEXT: movaps 112(%rsi), %xmm11
596 ; SSE-NEXT: movaps %xmm0, %xmm7
597 ; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm11[1]
598 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm11[0]
599 ; SSE-NEXT: movaps %xmm0, 224(%rdx)
600 ; SSE-NEXT: movaps %xmm7, 240(%rdx)
601 ; SSE-NEXT: movaps %xmm6, 192(%rdx)
602 ; SSE-NEXT: movaps %xmm12, 208(%rdx)
603 ; SSE-NEXT: movaps %xmm4, 160(%rdx)
604 ; SSE-NEXT: movaps %xmm13, 176(%rdx)
605 ; SSE-NEXT: movaps %xmm3, 128(%rdx)
606 ; SSE-NEXT: movaps %xmm15, 144(%rdx)
607 ; SSE-NEXT: movaps %xmm5, 96(%rdx)
608 ; SSE-NEXT: movaps %xmm14, 112(%rdx)
609 ; SSE-NEXT: movaps %xmm2, 64(%rdx)
610 ; SSE-NEXT: movaps %xmm10, 80(%rdx)
611 ; SSE-NEXT: movaps %xmm1, 32(%rdx)
612 ; SSE-NEXT: movaps %xmm9, 48(%rdx)
613 ; SSE-NEXT: movaps %xmm8, (%rdx)
614 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
615 ; SSE-NEXT: movaps %xmm0, 16(%rdx)
618 ; AVX-LABEL: store_i64_stride2_vf16:
620 ; AVX-NEXT: vmovaps (%rsi), %xmm0
621 ; AVX-NEXT: vmovaps 32(%rsi), %xmm1
622 ; AVX-NEXT: vmovaps 64(%rsi), %xmm2
623 ; AVX-NEXT: vmovaps 96(%rsi), %xmm3
624 ; AVX-NEXT: vmovaps (%rdi), %xmm4
625 ; AVX-NEXT: vmovaps 32(%rdi), %xmm5
626 ; AVX-NEXT: vmovaps 64(%rdi), %xmm6
627 ; AVX-NEXT: vmovaps 96(%rdi), %xmm7
628 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm7[1],xmm3[1]
629 ; AVX-NEXT: vmovlhps {{.*#+}} xmm3 = xmm7[0],xmm3[0]
630 ; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm3, %ymm3
631 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm6[1],xmm2[1]
632 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm6[0],xmm2[0]
633 ; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm2, %ymm2
634 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm4[1],xmm0[1]
635 ; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm4[0],xmm0[0]
636 ; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0
637 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm5[1],xmm1[1]
638 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm1[0]
639 ; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
640 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm4 = mem[2,3,2,3]
641 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm5 = mem[2,3,2,3]
642 ; AVX-NEXT: vshufpd {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[3],ymm4[3]
643 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm5 = mem[2,3,2,3]
644 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm6 = mem[2,3,2,3]
645 ; AVX-NEXT: vshufpd {{.*#+}} ymm5 = ymm6[0],ymm5[0],ymm6[3],ymm5[3]
646 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm6 = mem[2,3,2,3]
647 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm7 = mem[2,3,2,3]
648 ; AVX-NEXT: vshufpd {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[3],ymm6[3]
649 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm7 = mem[2,3,2,3]
650 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm8 = mem[2,3,2,3]
651 ; AVX-NEXT: vshufpd {{.*#+}} ymm7 = ymm8[0],ymm7[0],ymm8[3],ymm7[3]
652 ; AVX-NEXT: vmovapd %ymm7, 32(%rdx)
653 ; AVX-NEXT: vmovapd %ymm6, 96(%rdx)
654 ; AVX-NEXT: vmovapd %ymm5, 160(%rdx)
655 ; AVX-NEXT: vmovapd %ymm4, 224(%rdx)
656 ; AVX-NEXT: vmovaps %ymm1, 64(%rdx)
657 ; AVX-NEXT: vmovapd %ymm0, (%rdx)
658 ; AVX-NEXT: vmovaps %ymm2, 128(%rdx)
659 ; AVX-NEXT: vmovaps %ymm3, 192(%rdx)
660 ; AVX-NEXT: vzeroupper
663 ; AVX2-LABEL: store_i64_stride2_vf16:
665 ; AVX2-NEXT: vmovaps (%rdi), %ymm0
666 ; AVX2-NEXT: vmovaps 32(%rdi), %ymm1
667 ; AVX2-NEXT: vmovaps 64(%rdi), %ymm2
668 ; AVX2-NEXT: vmovaps 96(%rdi), %ymm3
669 ; AVX2-NEXT: vmovaps (%rsi), %ymm4
670 ; AVX2-NEXT: vmovaps 32(%rsi), %ymm5
671 ; AVX2-NEXT: vmovaps 64(%rsi), %ymm6
672 ; AVX2-NEXT: vmovaps 96(%rsi), %ymm7
673 ; AVX2-NEXT: vpermpd {{.*#+}} ymm8 = ymm4[0,2,2,3]
674 ; AVX2-NEXT: vpermpd {{.*#+}} ymm9 = ymm0[2,1,3,3]
675 ; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7]
676 ; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,0,2,1]
677 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
678 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5],ymm4[6,7]
679 ; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm5[0,2,2,3]
680 ; AVX2-NEXT: vpermpd {{.*#+}} ymm9 = ymm1[2,1,3,3]
681 ; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1],ymm4[2,3],ymm9[4,5],ymm4[6,7]
682 ; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,0,2,1]
683 ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,3]
684 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm5[2,3],ymm1[4,5],ymm5[6,7]
685 ; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm6[0,2,2,3]
686 ; AVX2-NEXT: vpermpd {{.*#+}} ymm9 = ymm2[2,1,3,3]
687 ; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm9[0,1],ymm5[2,3],ymm9[4,5],ymm5[6,7]
688 ; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,0,2,1]
689 ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,1,3]
690 ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm6[2,3],ymm2[4,5],ymm6[6,7]
691 ; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm7[0,2,2,3]
692 ; AVX2-NEXT: vpermpd {{.*#+}} ymm9 = ymm3[2,1,3,3]
693 ; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1],ymm6[2,3],ymm9[4,5],ymm6[6,7]
694 ; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,0,2,1]
695 ; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,1,3]
696 ; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm7[2,3],ymm3[4,5],ymm7[6,7]
697 ; AVX2-NEXT: vmovaps %ymm3, 192(%rdx)
698 ; AVX2-NEXT: vmovaps %ymm6, 224(%rdx)
699 ; AVX2-NEXT: vmovaps %ymm2, 128(%rdx)
700 ; AVX2-NEXT: vmovaps %ymm5, 160(%rdx)
701 ; AVX2-NEXT: vmovaps %ymm1, 64(%rdx)
702 ; AVX2-NEXT: vmovaps %ymm4, 96(%rdx)
703 ; AVX2-NEXT: vmovaps %ymm0, (%rdx)
704 ; AVX2-NEXT: vmovaps %ymm8, 32(%rdx)
705 ; AVX2-NEXT: vzeroupper
708 ; AVX2-FP-LABEL: store_i64_stride2_vf16:
710 ; AVX2-FP-NEXT: vmovaps (%rdi), %ymm0
711 ; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm1
712 ; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm2
713 ; AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm3
714 ; AVX2-FP-NEXT: vmovaps (%rsi), %ymm4
715 ; AVX2-FP-NEXT: vmovaps 32(%rsi), %ymm5
716 ; AVX2-FP-NEXT: vmovaps 64(%rsi), %ymm6
717 ; AVX2-FP-NEXT: vmovaps 96(%rsi), %ymm7
718 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm8 = ymm4[0,2,2,3]
719 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm9 = ymm0[2,1,3,3]
720 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7]
721 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,0,2,1]
722 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
723 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5],ymm4[6,7]
724 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm4 = ymm5[0,2,2,3]
725 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm9 = ymm1[2,1,3,3]
726 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1],ymm4[2,3],ymm9[4,5],ymm4[6,7]
727 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,0,2,1]
728 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,3]
729 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm5[2,3],ymm1[4,5],ymm5[6,7]
730 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm5 = ymm6[0,2,2,3]
731 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm9 = ymm2[2,1,3,3]
732 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm9[0,1],ymm5[2,3],ymm9[4,5],ymm5[6,7]
733 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,0,2,1]
734 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,1,3]
735 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm6[2,3],ymm2[4,5],ymm6[6,7]
736 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm6 = ymm7[0,2,2,3]
737 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm9 = ymm3[2,1,3,3]
738 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1],ymm6[2,3],ymm9[4,5],ymm6[6,7]
739 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,0,2,1]
740 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,1,3]
741 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm7[2,3],ymm3[4,5],ymm7[6,7]
742 ; AVX2-FP-NEXT: vmovaps %ymm3, 192(%rdx)
743 ; AVX2-FP-NEXT: vmovaps %ymm6, 224(%rdx)
744 ; AVX2-FP-NEXT: vmovaps %ymm2, 128(%rdx)
745 ; AVX2-FP-NEXT: vmovaps %ymm5, 160(%rdx)
746 ; AVX2-FP-NEXT: vmovaps %ymm1, 64(%rdx)
747 ; AVX2-FP-NEXT: vmovaps %ymm4, 96(%rdx)
748 ; AVX2-FP-NEXT: vmovaps %ymm0, (%rdx)
749 ; AVX2-FP-NEXT: vmovaps %ymm8, 32(%rdx)
750 ; AVX2-FP-NEXT: vzeroupper
753 ; AVX2-FCP-LABEL: store_i64_stride2_vf16:
755 ; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm0
756 ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm1
757 ; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm2
758 ; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm3
759 ; AVX2-FCP-NEXT: vmovaps (%rsi), %ymm4
760 ; AVX2-FCP-NEXT: vmovaps 32(%rsi), %ymm5
761 ; AVX2-FCP-NEXT: vmovaps 64(%rsi), %ymm6
762 ; AVX2-FCP-NEXT: vmovaps 96(%rsi), %ymm7
763 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm8 = ymm4[0,2,2,3]
764 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm9 = ymm0[2,1,3,3]
765 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7]
766 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,0,2,1]
767 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
768 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5],ymm4[6,7]
769 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm4 = ymm5[0,2,2,3]
770 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm9 = ymm1[2,1,3,3]
771 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1],ymm4[2,3],ymm9[4,5],ymm4[6,7]
772 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,0,2,1]
773 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,3]
774 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm5[2,3],ymm1[4,5],ymm5[6,7]
775 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm5 = ymm6[0,2,2,3]
776 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm9 = ymm2[2,1,3,3]
777 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm9[0,1],ymm5[2,3],ymm9[4,5],ymm5[6,7]
778 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,0,2,1]
779 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,1,3]
780 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm6[2,3],ymm2[4,5],ymm6[6,7]
781 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm6 = ymm7[0,2,2,3]
782 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm9 = ymm3[2,1,3,3]
783 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1],ymm6[2,3],ymm9[4,5],ymm6[6,7]
784 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,0,2,1]
785 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,1,3]
786 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm7[2,3],ymm3[4,5],ymm7[6,7]
787 ; AVX2-FCP-NEXT: vmovaps %ymm3, 192(%rdx)
788 ; AVX2-FCP-NEXT: vmovaps %ymm6, 224(%rdx)
789 ; AVX2-FCP-NEXT: vmovaps %ymm2, 128(%rdx)
790 ; AVX2-FCP-NEXT: vmovaps %ymm5, 160(%rdx)
791 ; AVX2-FCP-NEXT: vmovaps %ymm1, 64(%rdx)
792 ; AVX2-FCP-NEXT: vmovaps %ymm4, 96(%rdx)
793 ; AVX2-FCP-NEXT: vmovaps %ymm0, (%rdx)
794 ; AVX2-FCP-NEXT: vmovaps %ymm8, 32(%rdx)
795 ; AVX2-FCP-NEXT: vzeroupper
796 ; AVX2-FCP-NEXT: retq
798 ; AVX512-LABEL: store_i64_stride2_vf16:
800 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
801 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1
802 ; AVX512-NEXT: vmovdqa64 (%rsi), %zmm2
803 ; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm3
804 ; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm4 = [4,12,5,13,6,14,7,15]
805 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5
806 ; AVX512-NEXT: vpermt2q %zmm2, %zmm4, %zmm5
807 ; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,8,1,9,2,10,3,11]
808 ; AVX512-NEXT: vpermt2q %zmm2, %zmm6, %zmm0
809 ; AVX512-NEXT: vpermi2q %zmm3, %zmm1, %zmm4
810 ; AVX512-NEXT: vpermt2q %zmm3, %zmm6, %zmm1
811 ; AVX512-NEXT: vmovdqa64 %zmm1, 128(%rdx)
812 ; AVX512-NEXT: vmovdqa64 %zmm4, 192(%rdx)
813 ; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx)
814 ; AVX512-NEXT: vmovdqa64 %zmm5, 64(%rdx)
815 ; AVX512-NEXT: vzeroupper
818 ; AVX512-FCP-LABEL: store_i64_stride2_vf16:
819 ; AVX512-FCP: # %bb.0:
820 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
821 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
822 ; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm2
823 ; AVX512-FCP-NEXT: vmovdqa64 64(%rsi), %zmm3
824 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [4,12,5,13,6,14,7,15]
825 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm5
826 ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm5
827 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,8,1,9,2,10,3,11]
828 ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm6, %zmm0
829 ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm4
830 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm1
831 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 128(%rdx)
832 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 192(%rdx)
833 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%rdx)
834 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 64(%rdx)
835 ; AVX512-FCP-NEXT: vzeroupper
836 ; AVX512-FCP-NEXT: retq
838 ; AVX512DQ-LABEL: store_i64_stride2_vf16:
840 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0
841 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm1
842 ; AVX512DQ-NEXT: vmovdqa64 (%rsi), %zmm2
843 ; AVX512DQ-NEXT: vmovdqa64 64(%rsi), %zmm3
844 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm4 = [4,12,5,13,6,14,7,15]
845 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5
846 ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm4, %zmm5
847 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,8,1,9,2,10,3,11]
848 ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm6, %zmm0
849 ; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm1, %zmm4
850 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm6, %zmm1
851 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, 128(%rdx)
852 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, 192(%rdx)
853 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx)
854 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, 64(%rdx)
855 ; AVX512DQ-NEXT: vzeroupper
856 ; AVX512DQ-NEXT: retq
858 ; AVX512DQ-FCP-LABEL: store_i64_stride2_vf16:
859 ; AVX512DQ-FCP: # %bb.0:
860 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
861 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
862 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm2
863 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rsi), %zmm3
864 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [4,12,5,13,6,14,7,15]
865 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm5
866 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm5
867 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,8,1,9,2,10,3,11]
868 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm6, %zmm0
869 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm4
870 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm1
871 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 128(%rdx)
872 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 192(%rdx)
873 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%rdx)
874 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 64(%rdx)
875 ; AVX512DQ-FCP-NEXT: vzeroupper
876 ; AVX512DQ-FCP-NEXT: retq
878 ; AVX512BW-LABEL: store_i64_stride2_vf16:
880 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
881 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1
882 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm2
883 ; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm3
884 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [4,12,5,13,6,14,7,15]
885 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5
886 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm5
887 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,8,1,9,2,10,3,11]
888 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm6, %zmm0
889 ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm1, %zmm4
890 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm6, %zmm1
891 ; AVX512BW-NEXT: vmovdqa64 %zmm1, 128(%rdx)
892 ; AVX512BW-NEXT: vmovdqa64 %zmm4, 192(%rdx)
893 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
894 ; AVX512BW-NEXT: vmovdqa64 %zmm5, 64(%rdx)
895 ; AVX512BW-NEXT: vzeroupper
896 ; AVX512BW-NEXT: retq
898 ; AVX512BW-FCP-LABEL: store_i64_stride2_vf16:
899 ; AVX512BW-FCP: # %bb.0:
900 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
901 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
902 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm2
903 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm3
904 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [4,12,5,13,6,14,7,15]
905 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5
906 ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm5
907 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,8,1,9,2,10,3,11]
908 ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm6, %zmm0
909 ; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm4
910 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm1
911 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 128(%rdx)
912 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 192(%rdx)
913 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%rdx)
914 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 64(%rdx)
915 ; AVX512BW-FCP-NEXT: vzeroupper
916 ; AVX512BW-FCP-NEXT: retq
918 ; AVX512DQ-BW-LABEL: store_i64_stride2_vf16:
919 ; AVX512DQ-BW: # %bb.0:
920 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0
921 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1
922 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm2
923 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm3
924 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [4,12,5,13,6,14,7,15]
925 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5
926 ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm5
927 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,8,1,9,2,10,3,11]
928 ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm6, %zmm0
929 ; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm1, %zmm4
930 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm6, %zmm1
931 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 128(%rdx)
932 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 192(%rdx)
933 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%rdx)
934 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 64(%rdx)
935 ; AVX512DQ-BW-NEXT: vzeroupper
936 ; AVX512DQ-BW-NEXT: retq
938 ; AVX512DQ-BW-FCP-LABEL: store_i64_stride2_vf16:
939 ; AVX512DQ-BW-FCP: # %bb.0:
940 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
941 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
942 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm2
943 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm3
944 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [4,12,5,13,6,14,7,15]
945 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5
946 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm5
947 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,8,1,9,2,10,3,11]
948 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm6, %zmm0
949 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm4
950 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm1
951 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 128(%rdx)
952 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 192(%rdx)
953 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%rdx)
954 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 64(%rdx)
955 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
956 ; AVX512DQ-BW-FCP-NEXT: retq
957 %in.vec0 = load <16 x i64>, ptr %in.vecptr0, align 64
958 %in.vec1 = load <16 x i64>, ptr %in.vecptr1, align 64
959 %1 = shufflevector <16 x i64> %in.vec0, <16 x i64> %in.vec1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
960 %interleaved.vec = shufflevector <32 x i64> %1, <32 x i64> poison, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
961 store <32 x i64> %interleaved.vec, ptr %out.vec, align 64
  ret void
}
965 define void @store_i64_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.vec) nounwind {
966 ; SSE-LABEL: store_i64_stride2_vf32:
968 ; SSE-NEXT: subq $152, %rsp
969 ; SSE-NEXT: movaps 112(%rdi), %xmm14
970 ; SSE-NEXT: movaps 96(%rdi), %xmm13
971 ; SSE-NEXT: movaps 80(%rdi), %xmm11
972 ; SSE-NEXT: movaps 64(%rdi), %xmm10
973 ; SSE-NEXT: movaps (%rdi), %xmm7
974 ; SSE-NEXT: movaps 16(%rdi), %xmm8
975 ; SSE-NEXT: movaps 32(%rdi), %xmm9
976 ; SSE-NEXT: movaps 48(%rdi), %xmm12
977 ; SSE-NEXT: movaps 96(%rsi), %xmm0
978 ; SSE-NEXT: movaps 80(%rsi), %xmm1
979 ; SSE-NEXT: movaps 64(%rsi), %xmm2
980 ; SSE-NEXT: movaps (%rsi), %xmm3
981 ; SSE-NEXT: movaps 16(%rsi), %xmm4
982 ; SSE-NEXT: movaps 32(%rsi), %xmm5
983 ; SSE-NEXT: movaps 48(%rsi), %xmm6
984 ; SSE-NEXT: movaps %xmm7, %xmm15
985 ; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm3[0]
986 ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
987 ; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm3[1]
988 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
989 ; SSE-NEXT: movaps %xmm8, %xmm7
990 ; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm4[0]
991 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
992 ; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm4[1]
993 ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
994 ; SSE-NEXT: movaps %xmm9, %xmm4
995 ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0]
996 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
997 ; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm5[1]
998 ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
999 ; SSE-NEXT: movaps %xmm12, %xmm4
1000 ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm6[0]
1001 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1002 ; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm6[1]
1003 ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1004 ; SSE-NEXT: movaps %xmm10, %xmm3
1005 ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0]
1006 ; SSE-NEXT: movaps %xmm3, (%rsp) # 16-byte Spill
1007 ; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm2[1]
1008 ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1009 ; SSE-NEXT: movaps %xmm11, %xmm2
1010 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
1011 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1012 ; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm1[1]
1013 ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1014 ; SSE-NEXT: movaps %xmm13, %xmm1
1015 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
1016 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1017 ; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1]
1018 ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1019 ; SSE-NEXT: movaps 112(%rsi), %xmm0
1020 ; SSE-NEXT: movaps %xmm14, %xmm1
1021 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
1022 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1023 ; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1]
1024 ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1025 ; SSE-NEXT: movaps 128(%rdi), %xmm15
1026 ; SSE-NEXT: movaps 128(%rsi), %xmm0
1027 ; SSE-NEXT: movaps %xmm15, %xmm1
1028 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
1029 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1030 ; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1]
1031 ; SSE-NEXT: movaps 144(%rdi), %xmm13
1032 ; SSE-NEXT: movaps 144(%rsi), %xmm0
1033 ; SSE-NEXT: movaps %xmm13, %xmm14
1034 ; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm0[0]
1035 ; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1]
1036 ; SSE-NEXT: movaps 160(%rdi), %xmm10
1037 ; SSE-NEXT: movaps 160(%rsi), %xmm0
1038 ; SSE-NEXT: movaps %xmm10, %xmm12
1039 ; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm0[0]
1040 ; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1]
1041 ; SSE-NEXT: movaps 176(%rdi), %xmm8
1042 ; SSE-NEXT: movaps 176(%rsi), %xmm0
1043 ; SSE-NEXT: movaps %xmm8, %xmm11
1044 ; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0]
1045 ; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1]
1046 ; SSE-NEXT: movaps 192(%rdi), %xmm6
1047 ; SSE-NEXT: movaps 192(%rsi), %xmm0
1048 ; SSE-NEXT: movaps %xmm6, %xmm9
1049 ; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm0[0]
1050 ; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1]
1051 ; SSE-NEXT: movaps 208(%rdi), %xmm5
1052 ; SSE-NEXT: movaps 208(%rsi), %xmm1
1053 ; SSE-NEXT: movaps %xmm5, %xmm7
1054 ; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm1[0]
1055 ; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1]
1056 ; SSE-NEXT: movaps 224(%rdi), %xmm1
1057 ; SSE-NEXT: movaps 224(%rsi), %xmm3
1058 ; SSE-NEXT: movaps %xmm1, %xmm2
1059 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
1060 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
1061 ; SSE-NEXT: movaps 240(%rdi), %xmm3
1062 ; SSE-NEXT: movaps 240(%rsi), %xmm4
1063 ; SSE-NEXT: movaps %xmm3, %xmm0
1064 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0]
1065 ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1]
1066 ; SSE-NEXT: movaps %xmm3, 496(%rdx)
1067 ; SSE-NEXT: movaps %xmm0, 480(%rdx)
1068 ; SSE-NEXT: movaps %xmm1, 464(%rdx)
1069 ; SSE-NEXT: movaps %xmm2, 448(%rdx)
1070 ; SSE-NEXT: movaps %xmm5, 432(%rdx)
1071 ; SSE-NEXT: movaps %xmm7, 416(%rdx)
1072 ; SSE-NEXT: movaps %xmm6, 400(%rdx)
1073 ; SSE-NEXT: movaps %xmm9, 384(%rdx)
1074 ; SSE-NEXT: movaps %xmm8, 368(%rdx)
1075 ; SSE-NEXT: movaps %xmm11, 352(%rdx)
1076 ; SSE-NEXT: movaps %xmm10, 336(%rdx)
1077 ; SSE-NEXT: movaps %xmm12, 320(%rdx)
1078 ; SSE-NEXT: movaps %xmm13, 304(%rdx)
1079 ; SSE-NEXT: movaps %xmm14, 288(%rdx)
1080 ; SSE-NEXT: movaps %xmm15, 272(%rdx)
1081 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1082 ; SSE-NEXT: movaps %xmm0, 256(%rdx)
1083 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1084 ; SSE-NEXT: movaps %xmm0, 240(%rdx)
1085 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1086 ; SSE-NEXT: movaps %xmm0, 224(%rdx)
1087 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1088 ; SSE-NEXT: movaps %xmm0, 208(%rdx)
1089 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1090 ; SSE-NEXT: movaps %xmm0, 192(%rdx)
1091 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1092 ; SSE-NEXT: movaps %xmm0, 176(%rdx)
1093 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1094 ; SSE-NEXT: movaps %xmm0, 160(%rdx)
1095 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1096 ; SSE-NEXT: movaps %xmm0, 144(%rdx)
1097 ; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
1098 ; SSE-NEXT: movaps %xmm0, 128(%rdx)
1099 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1100 ; SSE-NEXT: movaps %xmm0, 112(%rdx)
1101 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1102 ; SSE-NEXT: movaps %xmm0, 96(%rdx)
1103 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1104 ; SSE-NEXT: movaps %xmm0, 80(%rdx)
1105 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1106 ; SSE-NEXT: movaps %xmm0, 64(%rdx)
1107 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1108 ; SSE-NEXT: movaps %xmm0, 48(%rdx)
1109 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1110 ; SSE-NEXT: movaps %xmm0, 32(%rdx)
1111 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1112 ; SSE-NEXT: movaps %xmm0, 16(%rdx)
1113 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1114 ; SSE-NEXT: movaps %xmm0, (%rdx)
1115 ; SSE-NEXT: addq $152, %rsp
1118 ; AVX-LABEL: store_i64_stride2_vf32:
1120 ; AVX-NEXT: vmovaps 224(%rsi), %xmm0
1121 ; AVX-NEXT: vmovaps 224(%rdi), %xmm1
1122 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm1[1],xmm0[1]
1123 ; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1124 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1125 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1126 ; AVX-NEXT: vmovaps 128(%rsi), %xmm1
1127 ; AVX-NEXT: vmovaps 128(%rdi), %xmm2
1128 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm2[1],xmm1[1]
1129 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
1130 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
1131 ; AVX-NEXT: vmovaps (%rsi), %xmm2
1132 ; AVX-NEXT: vmovaps 32(%rsi), %xmm3
1133 ; AVX-NEXT: vmovaps 64(%rsi), %xmm4
1134 ; AVX-NEXT: vmovaps 96(%rsi), %xmm5
1135 ; AVX-NEXT: vmovaps (%rdi), %xmm6
1136 ; AVX-NEXT: vmovaps 32(%rdi), %xmm7
1137 ; AVX-NEXT: vmovaps 64(%rdi), %xmm8
1138 ; AVX-NEXT: vmovaps 96(%rdi), %xmm9
1139 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm6[1],xmm2[1]
1140 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm6[0],xmm2[0]
1141 ; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm2, %ymm2
1142 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm7[1],xmm3[1]
1143 ; AVX-NEXT: vmovlhps {{.*#+}} xmm3 = xmm7[0],xmm3[0]
1144 ; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3
1145 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm8[1],xmm4[1]
1146 ; AVX-NEXT: vmovlhps {{.*#+}} xmm4 = xmm8[0],xmm4[0]
1147 ; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4
1148 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm9[1],xmm5[1]
1149 ; AVX-NEXT: vmovlhps {{.*#+}} xmm5 = xmm9[0],xmm5[0]
1150 ; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5
1151 ; AVX-NEXT: vmovaps 160(%rsi), %xmm6
1152 ; AVX-NEXT: vmovaps 160(%rdi), %xmm7
1153 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm7[1],xmm6[1]
1154 ; AVX-NEXT: vmovlhps {{.*#+}} xmm6 = xmm7[0],xmm6[0]
1155 ; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm6, %ymm6
1156 ; AVX-NEXT: vmovaps 192(%rsi), %xmm7
1157 ; AVX-NEXT: vmovaps 192(%rdi), %xmm8
1158 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm8[1],xmm7[1]
1159 ; AVX-NEXT: vmovlhps {{.*#+}} xmm7 = xmm8[0],xmm7[0]
1160 ; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm7, %ymm7
1161 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm8 = mem[2,3,2,3]
1162 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm9 = mem[2,3,2,3]
1163 ; AVX-NEXT: vshufpd {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[3],ymm8[3]
1164 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm9 = mem[2,3,2,3]
1165 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm10 = mem[2,3,2,3]
1166 ; AVX-NEXT: vshufpd {{.*#+}} ymm9 = ymm10[0],ymm9[0],ymm10[3],ymm9[3]
1167 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm10 = mem[2,3,2,3]
1168 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm11 = mem[2,3,2,3]
1169 ; AVX-NEXT: vshufpd {{.*#+}} ymm10 = ymm11[0],ymm10[0],ymm11[3],ymm10[3]
1170 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm11 = mem[2,3,2,3]
1171 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm12 = mem[2,3,2,3]
1172 ; AVX-NEXT: vshufpd {{.*#+}} ymm11 = ymm12[0],ymm11[0],ymm12[3],ymm11[3]
1173 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm12 = mem[2,3,2,3]
1174 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm13 = mem[2,3,2,3]
1175 ; AVX-NEXT: vshufpd {{.*#+}} ymm12 = ymm13[0],ymm12[0],ymm13[3],ymm12[3]
1176 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm13 = mem[2,3,2,3]
1177 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm14 = mem[2,3,2,3]
1178 ; AVX-NEXT: vshufpd {{.*#+}} ymm13 = ymm14[0],ymm13[0],ymm14[3],ymm13[3]
1179 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm14 = mem[2,3,2,3]
1180 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm15 = mem[2,3,2,3]
1181 ; AVX-NEXT: vshufpd {{.*#+}} ymm14 = ymm15[0],ymm14[0],ymm15[3],ymm14[3]
1182 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm15 = mem[2,3,2,3]
1183 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3,2,3]
1184 ; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm15[0],ymm0[3],ymm15[3]
1185 ; AVX-NEXT: vmovapd %ymm0, 416(%rdx)
1186 ; AVX-NEXT: vmovapd %ymm14, 352(%rdx)
1187 ; AVX-NEXT: vmovapd %ymm13, 224(%rdx)
1188 ; AVX-NEXT: vmovapd %ymm12, 32(%rdx)
1189 ; AVX-NEXT: vmovapd %ymm11, 96(%rdx)
1190 ; AVX-NEXT: vmovapd %ymm10, 160(%rdx)
1191 ; AVX-NEXT: vmovapd %ymm9, 288(%rdx)
1192 ; AVX-NEXT: vmovapd %ymm8, 480(%rdx)
1193 ; AVX-NEXT: vmovaps %ymm7, 384(%rdx)
1194 ; AVX-NEXT: vmovaps %ymm6, 320(%rdx)
1195 ; AVX-NEXT: vmovaps %ymm5, 192(%rdx)
1196 ; AVX-NEXT: vmovaps %ymm4, 128(%rdx)
1197 ; AVX-NEXT: vmovaps %ymm3, 64(%rdx)
1198 ; AVX-NEXT: vmovaps %ymm2, (%rdx)
1199 ; AVX-NEXT: vmovaps %ymm1, 256(%rdx)
1200 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1201 ; AVX-NEXT: vmovaps %ymm0, 448(%rdx)
1202 ; AVX-NEXT: vzeroupper
1205 ; AVX2-LABEL: store_i64_stride2_vf32:
1207 ; AVX2-NEXT: vmovaps 192(%rdi), %ymm2
1208 ; AVX2-NEXT: vmovaps 160(%rdi), %ymm5
1209 ; AVX2-NEXT: vmovaps 128(%rdi), %ymm9
1210 ; AVX2-NEXT: vmovaps (%rdi), %ymm1
1211 ; AVX2-NEXT: vmovaps 32(%rdi), %ymm4
1212 ; AVX2-NEXT: vmovaps 64(%rdi), %ymm8
1213 ; AVX2-NEXT: vmovaps 96(%rdi), %ymm12
1214 ; AVX2-NEXT: vmovaps 192(%rsi), %ymm6
1215 ; AVX2-NEXT: vmovaps 160(%rsi), %ymm10
1216 ; AVX2-NEXT: vmovaps 128(%rsi), %ymm13
1217 ; AVX2-NEXT: vmovaps (%rsi), %ymm3
1218 ; AVX2-NEXT: vmovaps 32(%rsi), %ymm7
1219 ; AVX2-NEXT: vmovaps 64(%rsi), %ymm11
1220 ; AVX2-NEXT: vmovaps 96(%rsi), %ymm14
1221 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm3[0,2,2,3]
1222 ; AVX2-NEXT: vpermpd {{.*#+}} ymm15 = ymm1[2,1,3,3]
1223 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3],ymm15[4,5],ymm0[6,7]
1224 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1225 ; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,0,2,1]
1226 ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,3]
1227 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm3[2,3],ymm1[4,5],ymm3[6,7]
1228 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1229 ; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm7[0,2,2,3]
1230 ; AVX2-NEXT: vpermpd {{.*#+}} ymm15 = ymm4[2,1,3,3]
1231 ; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1],ymm3[2,3],ymm15[4,5],ymm3[6,7]
1232 ; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,0,2,1]
1233 ; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,1,3]
1234 ; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2,3],ymm4[4,5],ymm7[6,7]
1235 ; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm11[0,2,2,3]
1236 ; AVX2-NEXT: vpermpd {{.*#+}} ymm15 = ymm8[2,1,3,3]
1237 ; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm15[0,1],ymm7[2,3],ymm15[4,5],ymm7[6,7]
1238 ; AVX2-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,0,2,1]
1239 ; AVX2-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,1,3]
1240 ; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm11[2,3],ymm8[4,5],ymm11[6,7]
1241 ; AVX2-NEXT: vpermpd {{.*#+}} ymm11 = ymm14[0,2,2,3]
1242 ; AVX2-NEXT: vpermpd {{.*#+}} ymm15 = ymm12[2,1,3,3]
1243 ; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1],ymm11[2,3],ymm15[4,5],ymm11[6,7]
1244 ; AVX2-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,0,2,1]
1245 ; AVX2-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[0,1,1,3]
1246 ; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm14[2,3],ymm12[4,5],ymm14[6,7]
1247 ; AVX2-NEXT: vpermpd {{.*#+}} ymm14 = ymm13[0,2,2,3]
1248 ; AVX2-NEXT: vpermpd {{.*#+}} ymm15 = ymm9[2,1,3,3]
1249 ; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3],ymm15[4,5],ymm14[6,7]
1250 ; AVX2-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,0,2,1]
1251 ; AVX2-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,1,3]
1252 ; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm13[2,3],ymm9[4,5],ymm13[6,7]
1253 ; AVX2-NEXT: vpermpd {{.*#+}} ymm13 = ymm10[0,2,2,3]
1254 ; AVX2-NEXT: vpermpd {{.*#+}} ymm15 = ymm5[2,1,3,3]
1255 ; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1],ymm13[2,3],ymm15[4,5],ymm13[6,7]
1256 ; AVX2-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,0,2,1]
1257 ; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,1,3]
1258 ; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm10[2,3],ymm5[4,5],ymm10[6,7]
1259 ; AVX2-NEXT: vpermpd {{.*#+}} ymm10 = ymm6[0,2,2,3]
1260 ; AVX2-NEXT: vpermpd {{.*#+}} ymm15 = ymm2[2,1,3,3]
1261 ; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm15[0,1],ymm10[2,3],ymm15[4,5],ymm10[6,7]
1262 ; AVX2-NEXT: vmovaps 224(%rdi), %ymm15
1263 ; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,0,2,1]
1264 ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,1,3]
1265 ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm6[2,3],ymm2[4,5],ymm6[6,7]
1266 ; AVX2-NEXT: vmovaps 224(%rsi), %ymm6
1267 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm6[0,2,2,3]
1268 ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm15[2,1,3,3]
1269 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
1270 ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm6[0,0,2,1]
1271 ; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm15[0,1,1,3]
1272 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3],ymm6[4,5],ymm1[6,7]
1273 ; AVX2-NEXT: vmovaps %ymm1, 448(%rdx)
1274 ; AVX2-NEXT: vmovaps %ymm0, 480(%rdx)
1275 ; AVX2-NEXT: vmovaps %ymm2, 384(%rdx)
1276 ; AVX2-NEXT: vmovaps %ymm10, 416(%rdx)
1277 ; AVX2-NEXT: vmovaps %ymm5, 320(%rdx)
1278 ; AVX2-NEXT: vmovaps %ymm13, 352(%rdx)
1279 ; AVX2-NEXT: vmovaps %ymm9, 256(%rdx)
1280 ; AVX2-NEXT: vmovaps %ymm14, 288(%rdx)
1281 ; AVX2-NEXT: vmovaps %ymm12, 192(%rdx)
1282 ; AVX2-NEXT: vmovaps %ymm11, 224(%rdx)
1283 ; AVX2-NEXT: vmovaps %ymm8, 128(%rdx)
1284 ; AVX2-NEXT: vmovaps %ymm7, 160(%rdx)
1285 ; AVX2-NEXT: vmovaps %ymm4, 64(%rdx)
1286 ; AVX2-NEXT: vmovaps %ymm3, 96(%rdx)
1287 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1288 ; AVX2-NEXT: vmovaps %ymm0, (%rdx)
1289 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1290 ; AVX2-NEXT: vmovaps %ymm0, 32(%rdx)
1291 ; AVX2-NEXT: vzeroupper
1292 ; AVX2-NEXT: retq
1294 ; AVX2-FP-LABEL: store_i64_stride2_vf32:
1295 ; AVX2-FP: # %bb.0:
1296 ; AVX2-FP-NEXT: vmovaps 192(%rdi), %ymm2
1297 ; AVX2-FP-NEXT: vmovaps 160(%rdi), %ymm5
1298 ; AVX2-FP-NEXT: vmovaps 128(%rdi), %ymm9
1299 ; AVX2-FP-NEXT: vmovaps (%rdi), %ymm1
1300 ; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm4
1301 ; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm8
1302 ; AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm12
1303 ; AVX2-FP-NEXT: vmovaps 192(%rsi), %ymm6
1304 ; AVX2-FP-NEXT: vmovaps 160(%rsi), %ymm10
1305 ; AVX2-FP-NEXT: vmovaps 128(%rsi), %ymm13
1306 ; AVX2-FP-NEXT: vmovaps (%rsi), %ymm3
1307 ; AVX2-FP-NEXT: vmovaps 32(%rsi), %ymm7
1308 ; AVX2-FP-NEXT: vmovaps 64(%rsi), %ymm11
1309 ; AVX2-FP-NEXT: vmovaps 96(%rsi), %ymm14
1310 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm3[0,2,2,3]
1311 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm15 = ymm1[2,1,3,3]
1312 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3],ymm15[4,5],ymm0[6,7]
1313 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1314 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,0,2,1]
1315 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,3]
1316 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm3[2,3],ymm1[4,5],ymm3[6,7]
1317 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1318 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm7[0,2,2,3]
1319 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm15 = ymm4[2,1,3,3]
1320 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1],ymm3[2,3],ymm15[4,5],ymm3[6,7]
1321 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,0,2,1]
1322 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,1,3]
1323 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2,3],ymm4[4,5],ymm7[6,7]
1324 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm11[0,2,2,3]
1325 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm15 = ymm8[2,1,3,3]
1326 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm15[0,1],ymm7[2,3],ymm15[4,5],ymm7[6,7]
1327 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,0,2,1]
1328 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,1,3]
1329 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm11[2,3],ymm8[4,5],ymm11[6,7]
1330 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm11 = ymm14[0,2,2,3]
1331 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm15 = ymm12[2,1,3,3]
1332 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1],ymm11[2,3],ymm15[4,5],ymm11[6,7]
1333 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,0,2,1]
1334 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[0,1,1,3]
1335 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm14[2,3],ymm12[4,5],ymm14[6,7]
1336 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm14 = ymm13[0,2,2,3]
1337 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm15 = ymm9[2,1,3,3]
1338 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3],ymm15[4,5],ymm14[6,7]
1339 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,0,2,1]
1340 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,1,3]
1341 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm13[2,3],ymm9[4,5],ymm13[6,7]
1342 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm13 = ymm10[0,2,2,3]
1343 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm15 = ymm5[2,1,3,3]
1344 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1],ymm13[2,3],ymm15[4,5],ymm13[6,7]
1345 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,0,2,1]
1346 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,1,3]
1347 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm10[2,3],ymm5[4,5],ymm10[6,7]
1348 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm10 = ymm6[0,2,2,3]
1349 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm15 = ymm2[2,1,3,3]
1350 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm15[0,1],ymm10[2,3],ymm15[4,5],ymm10[6,7]
1351 ; AVX2-FP-NEXT: vmovaps 224(%rdi), %ymm15
1352 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,0,2,1]
1353 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,1,3]
1354 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm6[2,3],ymm2[4,5],ymm6[6,7]
1355 ; AVX2-FP-NEXT: vmovaps 224(%rsi), %ymm6
1356 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm6[0,2,2,3]
1357 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm15[2,1,3,3]
1358 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
1359 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm6[0,0,2,1]
1360 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm6 = ymm15[0,1,1,3]
1361 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3],ymm6[4,5],ymm1[6,7]
1362 ; AVX2-FP-NEXT: vmovaps %ymm1, 448(%rdx)
1363 ; AVX2-FP-NEXT: vmovaps %ymm0, 480(%rdx)
1364 ; AVX2-FP-NEXT: vmovaps %ymm2, 384(%rdx)
1365 ; AVX2-FP-NEXT: vmovaps %ymm10, 416(%rdx)
1366 ; AVX2-FP-NEXT: vmovaps %ymm5, 320(%rdx)
1367 ; AVX2-FP-NEXT: vmovaps %ymm13, 352(%rdx)
1368 ; AVX2-FP-NEXT: vmovaps %ymm9, 256(%rdx)
1369 ; AVX2-FP-NEXT: vmovaps %ymm14, 288(%rdx)
1370 ; AVX2-FP-NEXT: vmovaps %ymm12, 192(%rdx)
1371 ; AVX2-FP-NEXT: vmovaps %ymm11, 224(%rdx)
1372 ; AVX2-FP-NEXT: vmovaps %ymm8, 128(%rdx)
1373 ; AVX2-FP-NEXT: vmovaps %ymm7, 160(%rdx)
1374 ; AVX2-FP-NEXT: vmovaps %ymm4, 64(%rdx)
1375 ; AVX2-FP-NEXT: vmovaps %ymm3, 96(%rdx)
1376 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1377 ; AVX2-FP-NEXT: vmovaps %ymm0, (%rdx)
1378 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1379 ; AVX2-FP-NEXT: vmovaps %ymm0, 32(%rdx)
1380 ; AVX2-FP-NEXT: vzeroupper
1381 ; AVX2-FP-NEXT: retq
1383 ; AVX2-FCP-LABEL: store_i64_stride2_vf32:
1384 ; AVX2-FCP: # %bb.0:
1385 ; AVX2-FCP-NEXT: vmovaps 192(%rdi), %ymm2
1386 ; AVX2-FCP-NEXT: vmovaps 160(%rdi), %ymm5
1387 ; AVX2-FCP-NEXT: vmovaps 128(%rdi), %ymm9
1388 ; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm1
1389 ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm4
1390 ; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm8
1391 ; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm12
1392 ; AVX2-FCP-NEXT: vmovaps 192(%rsi), %ymm6
1393 ; AVX2-FCP-NEXT: vmovaps 160(%rsi), %ymm10
1394 ; AVX2-FCP-NEXT: vmovaps 128(%rsi), %ymm13
1395 ; AVX2-FCP-NEXT: vmovaps (%rsi), %ymm3
1396 ; AVX2-FCP-NEXT: vmovaps 32(%rsi), %ymm7
1397 ; AVX2-FCP-NEXT: vmovaps 64(%rsi), %ymm11
1398 ; AVX2-FCP-NEXT: vmovaps 96(%rsi), %ymm14
1399 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm3[0,2,2,3]
1400 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm15 = ymm1[2,1,3,3]
1401 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3],ymm15[4,5],ymm0[6,7]
1402 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1403 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,0,2,1]
1404 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,3]
1405 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm3[2,3],ymm1[4,5],ymm3[6,7]
1406 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1407 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm7[0,2,2,3]
1408 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm15 = ymm4[2,1,3,3]
1409 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1],ymm3[2,3],ymm15[4,5],ymm3[6,7]
1410 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,0,2,1]
1411 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,1,3]
1412 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2,3],ymm4[4,5],ymm7[6,7]
1413 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm7 = ymm11[0,2,2,3]
1414 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm15 = ymm8[2,1,3,3]
1415 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm15[0,1],ymm7[2,3],ymm15[4,5],ymm7[6,7]
1416 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,0,2,1]
1417 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,1,3]
1418 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm11[2,3],ymm8[4,5],ymm11[6,7]
1419 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm11 = ymm14[0,2,2,3]
1420 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm15 = ymm12[2,1,3,3]
1421 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1],ymm11[2,3],ymm15[4,5],ymm11[6,7]
1422 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,0,2,1]
1423 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[0,1,1,3]
1424 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm14[2,3],ymm12[4,5],ymm14[6,7]
1425 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm14 = ymm13[0,2,2,3]
1426 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm15 = ymm9[2,1,3,3]
1427 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3],ymm15[4,5],ymm14[6,7]
1428 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,0,2,1]
1429 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,1,3]
1430 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm13[2,3],ymm9[4,5],ymm13[6,7]
1431 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm13 = ymm10[0,2,2,3]
1432 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm15 = ymm5[2,1,3,3]
1433 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1],ymm13[2,3],ymm15[4,5],ymm13[6,7]
1434 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,0,2,1]
1435 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,1,3]
1436 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm10[2,3],ymm5[4,5],ymm10[6,7]
1437 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm10 = ymm6[0,2,2,3]
1438 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm15 = ymm2[2,1,3,3]
1439 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm15[0,1],ymm10[2,3],ymm15[4,5],ymm10[6,7]
1440 ; AVX2-FCP-NEXT: vmovaps 224(%rdi), %ymm15
1441 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,0,2,1]
1442 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,1,3]
1443 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm6[2,3],ymm2[4,5],ymm6[6,7]
1444 ; AVX2-FCP-NEXT: vmovaps 224(%rsi), %ymm6
1445 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm6[0,2,2,3]
1446 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm15[2,1,3,3]
1447 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
1448 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm6[0,0,2,1]
1449 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm6 = ymm15[0,1,1,3]
1450 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3],ymm6[4,5],ymm1[6,7]
1451 ; AVX2-FCP-NEXT: vmovaps %ymm1, 448(%rdx)
1452 ; AVX2-FCP-NEXT: vmovaps %ymm0, 480(%rdx)
1453 ; AVX2-FCP-NEXT: vmovaps %ymm2, 384(%rdx)
1454 ; AVX2-FCP-NEXT: vmovaps %ymm10, 416(%rdx)
1455 ; AVX2-FCP-NEXT: vmovaps %ymm5, 320(%rdx)
1456 ; AVX2-FCP-NEXT: vmovaps %ymm13, 352(%rdx)
1457 ; AVX2-FCP-NEXT: vmovaps %ymm9, 256(%rdx)
1458 ; AVX2-FCP-NEXT: vmovaps %ymm14, 288(%rdx)
1459 ; AVX2-FCP-NEXT: vmovaps %ymm12, 192(%rdx)
1460 ; AVX2-FCP-NEXT: vmovaps %ymm11, 224(%rdx)
1461 ; AVX2-FCP-NEXT: vmovaps %ymm8, 128(%rdx)
1462 ; AVX2-FCP-NEXT: vmovaps %ymm7, 160(%rdx)
1463 ; AVX2-FCP-NEXT: vmovaps %ymm4, 64(%rdx)
1464 ; AVX2-FCP-NEXT: vmovaps %ymm3, 96(%rdx)
1465 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1466 ; AVX2-FCP-NEXT: vmovaps %ymm0, (%rdx)
1467 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1468 ; AVX2-FCP-NEXT: vmovaps %ymm0, 32(%rdx)
1469 ; AVX2-FCP-NEXT: vzeroupper
1470 ; AVX2-FCP-NEXT: retq
1472 ; AVX512-LABEL: store_i64_stride2_vf32:
1473 ; AVX512: # %bb.0:
1474 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
1475 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1
1476 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm2
1477 ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm3
1478 ; AVX512-NEXT: vmovdqa64 (%rsi), %zmm4
1479 ; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm5
1480 ; AVX512-NEXT: vmovdqa64 128(%rsi), %zmm6
1481 ; AVX512-NEXT: vmovdqa64 192(%rsi), %zmm7
1482 ; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm8 = [4,12,5,13,6,14,7,15]
1483 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm9
1484 ; AVX512-NEXT: vpermt2q %zmm4, %zmm8, %zmm9
1485 ; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,8,1,9,2,10,3,11]
1486 ; AVX512-NEXT: vpermt2q %zmm4, %zmm10, %zmm0
1487 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4
1488 ; AVX512-NEXT: vpermt2q %zmm5, %zmm8, %zmm4
1489 ; AVX512-NEXT: vpermt2q %zmm5, %zmm10, %zmm1
1490 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm5
1491 ; AVX512-NEXT: vpermt2q %zmm6, %zmm8, %zmm5
1492 ; AVX512-NEXT: vpermt2q %zmm6, %zmm10, %zmm2
1493 ; AVX512-NEXT: vpermi2q %zmm7, %zmm3, %zmm8
1494 ; AVX512-NEXT: vpermt2q %zmm7, %zmm10, %zmm3
1495 ; AVX512-NEXT: vmovdqa64 %zmm3, 384(%rdx)
1496 ; AVX512-NEXT: vmovdqa64 %zmm8, 448(%rdx)
1497 ; AVX512-NEXT: vmovdqa64 %zmm2, 256(%rdx)
1498 ; AVX512-NEXT: vmovdqa64 %zmm5, 320(%rdx)
1499 ; AVX512-NEXT: vmovdqa64 %zmm1, 128(%rdx)
1500 ; AVX512-NEXT: vmovdqa64 %zmm4, 192(%rdx)
1501 ; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx)
1502 ; AVX512-NEXT: vmovdqa64 %zmm9, 64(%rdx)
1503 ; AVX512-NEXT: vzeroupper
1504 ; AVX512-NEXT: retq
1506 ; AVX512-FCP-LABEL: store_i64_stride2_vf32:
1507 ; AVX512-FCP: # %bb.0:
1508 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
1509 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
1510 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2
1511 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3
1512 ; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm4
1513 ; AVX512-FCP-NEXT: vmovdqa64 64(%rsi), %zmm5
1514 ; AVX512-FCP-NEXT: vmovdqa64 128(%rsi), %zmm6
1515 ; AVX512-FCP-NEXT: vmovdqa64 192(%rsi), %zmm7
1516 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [4,12,5,13,6,14,7,15]
1517 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm9
1518 ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm9
1519 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,8,1,9,2,10,3,11]
1520 ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm10, %zmm0
1521 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4
1522 ; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm8, %zmm4
1523 ; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm10, %zmm1
1524 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm5
1525 ; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm8, %zmm5
1526 ; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm10, %zmm2
1527 ; AVX512-FCP-NEXT: vpermi2q %zmm7, %zmm3, %zmm8
1528 ; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm10, %zmm3
1529 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 384(%rdx)
1530 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, 448(%rdx)
1531 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 256(%rdx)
1532 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 320(%rdx)
1533 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 128(%rdx)
1534 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 192(%rdx)
1535 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%rdx)
1536 ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, 64(%rdx)
1537 ; AVX512-FCP-NEXT: vzeroupper
1538 ; AVX512-FCP-NEXT: retq
1540 ; AVX512DQ-LABEL: store_i64_stride2_vf32:
1541 ; AVX512DQ: # %bb.0:
1542 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0
1543 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm1
1544 ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm2
1545 ; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm3
1546 ; AVX512DQ-NEXT: vmovdqa64 (%rsi), %zmm4
1547 ; AVX512DQ-NEXT: vmovdqa64 64(%rsi), %zmm5
1548 ; AVX512DQ-NEXT: vmovdqa64 128(%rsi), %zmm6
1549 ; AVX512DQ-NEXT: vmovdqa64 192(%rsi), %zmm7
1550 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm8 = [4,12,5,13,6,14,7,15]
1551 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm9
1552 ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm8, %zmm9
1553 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,8,1,9,2,10,3,11]
1554 ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm10, %zmm0
1555 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4
1556 ; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm8, %zmm4
1557 ; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm10, %zmm1
1558 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm5
1559 ; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm8, %zmm5
1560 ; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm10, %zmm2
1561 ; AVX512DQ-NEXT: vpermi2q %zmm7, %zmm3, %zmm8
1562 ; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm10, %zmm3
1563 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, 384(%rdx)
1564 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, 448(%rdx)
1565 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, 256(%rdx)
1566 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, 320(%rdx)
1567 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, 128(%rdx)
1568 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, 192(%rdx)
1569 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx)
1570 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, 64(%rdx)
1571 ; AVX512DQ-NEXT: vzeroupper
1572 ; AVX512DQ-NEXT: retq
1574 ; AVX512DQ-FCP-LABEL: store_i64_stride2_vf32:
1575 ; AVX512DQ-FCP: # %bb.0:
1576 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
1577 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
1578 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2
1579 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3
1580 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm4
1581 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rsi), %zmm5
1582 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rsi), %zmm6
1583 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rsi), %zmm7
1584 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [4,12,5,13,6,14,7,15]
1585 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm9
1586 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm9
1587 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,8,1,9,2,10,3,11]
1588 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm10, %zmm0
1589 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4
1590 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm8, %zmm4
1591 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm10, %zmm1
1592 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm5
1593 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm8, %zmm5
1594 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm10, %zmm2
1595 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm7, %zmm3, %zmm8
1596 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm10, %zmm3
1597 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 384(%rdx)
1598 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, 448(%rdx)
1599 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 256(%rdx)
1600 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 320(%rdx)
1601 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 128(%rdx)
1602 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 192(%rdx)
1603 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%rdx)
1604 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, 64(%rdx)
1605 ; AVX512DQ-FCP-NEXT: vzeroupper
1606 ; AVX512DQ-FCP-NEXT: retq
1608 ; AVX512BW-LABEL: store_i64_stride2_vf32:
1609 ; AVX512BW: # %bb.0:
1610 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
1611 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1
1612 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm2
1613 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm3
1614 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm4
1615 ; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm5
1616 ; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm6
1617 ; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm7
1618 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [4,12,5,13,6,14,7,15]
1619 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9
1620 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm8, %zmm9
1621 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,8,1,9,2,10,3,11]
1622 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm10, %zmm0
1623 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4
1624 ; AVX512BW-NEXT: vpermt2q %zmm5, %zmm8, %zmm4
1625 ; AVX512BW-NEXT: vpermt2q %zmm5, %zmm10, %zmm1
1626 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5
1627 ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm8, %zmm5
1628 ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm10, %zmm2
1629 ; AVX512BW-NEXT: vpermi2q %zmm7, %zmm3, %zmm8
1630 ; AVX512BW-NEXT: vpermt2q %zmm7, %zmm10, %zmm3
1631 ; AVX512BW-NEXT: vmovdqa64 %zmm3, 384(%rdx)
1632 ; AVX512BW-NEXT: vmovdqa64 %zmm8, 448(%rdx)
1633 ; AVX512BW-NEXT: vmovdqa64 %zmm2, 256(%rdx)
1634 ; AVX512BW-NEXT: vmovdqa64 %zmm5, 320(%rdx)
1635 ; AVX512BW-NEXT: vmovdqa64 %zmm1, 128(%rdx)
1636 ; AVX512BW-NEXT: vmovdqa64 %zmm4, 192(%rdx)
1637 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
1638 ; AVX512BW-NEXT: vmovdqa64 %zmm9, 64(%rdx)
1639 ; AVX512BW-NEXT: vzeroupper
1640 ; AVX512BW-NEXT: retq
1642 ; AVX512BW-FCP-LABEL: store_i64_stride2_vf32:
1643 ; AVX512BW-FCP: # %bb.0:
1644 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
1645 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
1646 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2
1647 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3
1648 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm4
1649 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm5
1650 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rsi), %zmm6
1651 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rsi), %zmm7
1652 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [4,12,5,13,6,14,7,15]
1653 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm9
1654 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm9
1655 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,8,1,9,2,10,3,11]
1656 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm10, %zmm0
1657 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4
1658 ; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm8, %zmm4
1659 ; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm10, %zmm1
1660 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5
1661 ; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm8, %zmm5
1662 ; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm10, %zmm2
1663 ; AVX512BW-FCP-NEXT: vpermi2q %zmm7, %zmm3, %zmm8
1664 ; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm10, %zmm3
1665 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 384(%rdx)
1666 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, 448(%rdx)
1667 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 256(%rdx)
1668 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 320(%rdx)
1669 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 128(%rdx)
1670 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 192(%rdx)
1671 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%rdx)
1672 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, 64(%rdx)
1673 ; AVX512BW-FCP-NEXT: vzeroupper
1674 ; AVX512BW-FCP-NEXT: retq
1676 ; AVX512DQ-BW-LABEL: store_i64_stride2_vf32:
1677 ; AVX512DQ-BW: # %bb.0:
1678 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0
1679 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1
1680 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm2
1681 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm3
1682 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm4
1683 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm5
1684 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rsi), %zmm6
1685 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rsi), %zmm7
1686 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [4,12,5,13,6,14,7,15]
1687 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm9
1688 ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm8, %zmm9
1689 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,8,1,9,2,10,3,11]
1690 ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm10, %zmm0
1691 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4
1692 ; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm8, %zmm4
1693 ; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm10, %zmm1
1694 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm5
1695 ; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm8, %zmm5
1696 ; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm10, %zmm2
1697 ; AVX512DQ-BW-NEXT: vpermi2q %zmm7, %zmm3, %zmm8
1698 ; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm10, %zmm3
1699 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 384(%rdx)
1700 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, 448(%rdx)
1701 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 256(%rdx)
1702 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 320(%rdx)
1703 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 128(%rdx)
1704 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 192(%rdx)
1705 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%rdx)
1706 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, 64(%rdx)
1707 ; AVX512DQ-BW-NEXT: vzeroupper
1708 ; AVX512DQ-BW-NEXT: retq
1710 ; AVX512DQ-BW-FCP-LABEL: store_i64_stride2_vf32:
1711 ; AVX512DQ-BW-FCP: # %bb.0:
1712 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
1713 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
1714 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2
1715 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3
1716 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm4
1717 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm5
1718 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rsi), %zmm6
1719 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rsi), %zmm7
1720 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [4,12,5,13,6,14,7,15]
1721 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm9
1722 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm9
1723 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,8,1,9,2,10,3,11]
1724 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm10, %zmm0
1725 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4
1726 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm8, %zmm4
1727 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm10, %zmm1
1728 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5
1729 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm8, %zmm5
1730 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm10, %zmm2
1731 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm7, %zmm3, %zmm8
1732 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm10, %zmm3
1733 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 384(%rdx)
1734 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, 448(%rdx)
1735 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 256(%rdx)
1736 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 320(%rdx)
1737 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 128(%rdx)
1738 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 192(%rdx)
1739 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%rdx)
1740 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, 64(%rdx)
1741 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
1742 ; AVX512DQ-BW-FCP-NEXT: retq
1743 %in.vec0 = load <32 x i64>, ptr %in.vecptr0, align 64
1744 %in.vec1 = load <32 x i64>, ptr %in.vecptr1, align 64
1745 %1 = shufflevector <32 x i64> %in.vec0, <32 x i64> %in.vec1, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
1746 %interleaved.vec = shufflevector <64 x i64> %1, <64 x i64> poison, <64 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
1747 store <64 x i64> %interleaved.vec, ptr %out.vec, align 64
1748 ret void
1749 }
1751 define void @store_i64_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.vec) nounwind {
1752 ; SSE-LABEL: store_i64_stride2_vf64:
1753 ; SSE: # %bb.0:
1754 ; SSE-NEXT: subq $664, %rsp # imm = 0x298
1755 ; SSE-NEXT: movaps 112(%rdi), %xmm14
1756 ; SSE-NEXT: movaps 96(%rdi), %xmm13
1757 ; SSE-NEXT: movaps 80(%rdi), %xmm11
1758 ; SSE-NEXT: movaps 64(%rdi), %xmm10
1759 ; SSE-NEXT: movaps (%rdi), %xmm7
1760 ; SSE-NEXT: movaps 16(%rdi), %xmm8
1761 ; SSE-NEXT: movaps 32(%rdi), %xmm9
1762 ; SSE-NEXT: movaps 48(%rdi), %xmm12
1763 ; SSE-NEXT: movaps 96(%rsi), %xmm0
1764 ; SSE-NEXT: movaps 80(%rsi), %xmm1
1765 ; SSE-NEXT: movaps 64(%rsi), %xmm2
1766 ; SSE-NEXT: movaps (%rsi), %xmm3
1767 ; SSE-NEXT: movaps 16(%rsi), %xmm4
1768 ; SSE-NEXT: movaps 32(%rsi), %xmm5
1769 ; SSE-NEXT: movaps 48(%rsi), %xmm6
1770 ; SSE-NEXT: movaps %xmm7, %xmm15
1771 ; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm3[0]
1772 ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1773 ; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm3[1]
1774 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1775 ; SSE-NEXT: movaps %xmm8, %xmm3
1776 ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
1777 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1778 ; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm4[1]
1779 ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1780 ; SSE-NEXT: movaps %xmm9, %xmm3
1781 ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm5[0]
1782 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1783 ; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm5[1]
1784 ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1785 ; SSE-NEXT: movaps %xmm12, %xmm3
1786 ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm6[0]
1787 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1788 ; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm6[1]
1789 ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1790 ; SSE-NEXT: movaps %xmm10, %xmm3
1791 ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0]
1792 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1793 ; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm2[1]
1794 ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1795 ; SSE-NEXT: movaps %xmm11, %xmm2
1796 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
1797 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1798 ; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm1[1]
1799 ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1800 ; SSE-NEXT: movaps %xmm13, %xmm1
1801 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
1802 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1803 ; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1]
1804 ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1805 ; SSE-NEXT: movaps 112(%rsi), %xmm0
1806 ; SSE-NEXT: movaps %xmm14, %xmm1
1807 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
1808 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1809 ; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1]
1810 ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1811 ; SSE-NEXT: movaps 128(%rdi), %xmm1
1812 ; SSE-NEXT: movaps 128(%rsi), %xmm0
1813 ; SSE-NEXT: movaps %xmm1, %xmm2
1814 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
1815 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1816 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1817 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1818 ; SSE-NEXT: movaps 144(%rdi), %xmm1
1819 ; SSE-NEXT: movaps 144(%rsi), %xmm0
1820 ; SSE-NEXT: movaps %xmm1, %xmm2
1821 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
1822 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1823 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1824 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1825 ; SSE-NEXT: movaps 160(%rdi), %xmm1
1826 ; SSE-NEXT: movaps 160(%rsi), %xmm0
1827 ; SSE-NEXT: movaps %xmm1, %xmm2
1828 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
1829 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1830 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1831 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1832 ; SSE-NEXT: movaps 176(%rdi), %xmm1
1833 ; SSE-NEXT: movaps 176(%rsi), %xmm0
1834 ; SSE-NEXT: movaps %xmm1, %xmm2
1835 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
1836 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1837 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1838 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1839 ; SSE-NEXT: movaps 192(%rdi), %xmm1
1840 ; SSE-NEXT: movaps 192(%rsi), %xmm0
1841 ; SSE-NEXT: movaps %xmm1, %xmm2
1842 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
1843 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1844 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1845 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1846 ; SSE-NEXT: movaps 208(%rdi), %xmm1
1847 ; SSE-NEXT: movaps 208(%rsi), %xmm0
1848 ; SSE-NEXT: movaps %xmm1, %xmm2
1849 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
1850 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1851 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1852 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1853 ; SSE-NEXT: movaps 224(%rdi), %xmm1
1854 ; SSE-NEXT: movaps 224(%rsi), %xmm0
1855 ; SSE-NEXT: movaps %xmm1, %xmm2
1856 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
1857 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1858 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1859 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1860 ; SSE-NEXT: movaps 240(%rdi), %xmm1
1861 ; SSE-NEXT: movaps 240(%rsi), %xmm0
1862 ; SSE-NEXT: movaps %xmm1, %xmm2
1863 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
1864 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1865 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1866 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1867 ; SSE-NEXT: movaps 256(%rdi), %xmm1
1868 ; SSE-NEXT: movaps 256(%rsi), %xmm0
1869 ; SSE-NEXT: movaps %xmm1, %xmm2
1870 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
1871 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1872 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1873 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1874 ; SSE-NEXT: movaps 272(%rdi), %xmm1
1875 ; SSE-NEXT: movaps 272(%rsi), %xmm0
1876 ; SSE-NEXT: movaps %xmm1, %xmm2
1877 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
1878 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1879 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1880 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1881 ; SSE-NEXT: movaps 288(%rdi), %xmm1
1882 ; SSE-NEXT: movaps 288(%rsi), %xmm0
1883 ; SSE-NEXT: movaps %xmm1, %xmm2
1884 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
1885 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1886 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1887 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1888 ; SSE-NEXT: movaps 304(%rdi), %xmm1
1889 ; SSE-NEXT: movaps 304(%rsi), %xmm0
1890 ; SSE-NEXT: movaps %xmm1, %xmm2
1891 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
1892 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1893 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1894 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1895 ; SSE-NEXT: movaps 320(%rdi), %xmm1
1896 ; SSE-NEXT: movaps 320(%rsi), %xmm0
1897 ; SSE-NEXT: movaps %xmm1, %xmm2
1898 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
1899 ; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill
1900 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1901 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1902 ; SSE-NEXT: movaps 336(%rdi), %xmm1
1903 ; SSE-NEXT: movaps 336(%rsi), %xmm0
1904 ; SSE-NEXT: movaps %xmm1, %xmm2
1905 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
1906 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1907 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1908 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1909 ; SSE-NEXT: movaps 352(%rdi), %xmm1
1910 ; SSE-NEXT: movaps 352(%rsi), %xmm0
1911 ; SSE-NEXT: movaps %xmm1, %xmm2
1912 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
1913 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1914 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1915 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1916 ; SSE-NEXT: movaps 368(%rdi), %xmm15
1917 ; SSE-NEXT: movaps 368(%rsi), %xmm0
1918 ; SSE-NEXT: movaps %xmm15, %xmm1
1919 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
1920 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1921 ; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1]
1922 ; SSE-NEXT: movaps 384(%rdi), %xmm13
1923 ; SSE-NEXT: movaps 384(%rsi), %xmm0
1924 ; SSE-NEXT: movaps %xmm13, %xmm1
1925 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
1926 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1927 ; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1]
1928 ; SSE-NEXT: movaps 400(%rdi), %xmm11
1929 ; SSE-NEXT: movaps 400(%rsi), %xmm0
1930 ; SSE-NEXT: movaps %xmm11, %xmm1
1931 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
1932 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1933 ; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1]
1934 ; SSE-NEXT: movaps 416(%rdi), %xmm12
1935 ; SSE-NEXT: movaps 416(%rsi), %xmm0
1936 ; SSE-NEXT: movaps %xmm12, %xmm14
1937 ; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm0[0]
1938 ; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1]
1939 ; SSE-NEXT: movaps 432(%rdi), %xmm8
1940 ; SSE-NEXT: movaps 432(%rsi), %xmm0
1941 ; SSE-NEXT: movaps %xmm8, %xmm10
1942 ; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0]
1943 ; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1]
1944 ; SSE-NEXT: movaps 448(%rdi), %xmm6
1945 ; SSE-NEXT: movaps 448(%rsi), %xmm0
1946 ; SSE-NEXT: movaps %xmm6, %xmm9
1947 ; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm0[0]
1948 ; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1]
1949 ; SSE-NEXT: movaps 464(%rdi), %xmm5
1950 ; SSE-NEXT: movaps 464(%rsi), %xmm1
1951 ; SSE-NEXT: movaps %xmm5, %xmm7
1952 ; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm1[0]
1953 ; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1]
1954 ; SSE-NEXT: movaps 480(%rdi), %xmm1
1955 ; SSE-NEXT: movaps 480(%rsi), %xmm3
1956 ; SSE-NEXT: movaps %xmm1, %xmm2
1957 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
1958 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
1959 ; SSE-NEXT: movaps 496(%rdi), %xmm3
1960 ; SSE-NEXT: movaps 496(%rsi), %xmm4
1961 ; SSE-NEXT: movaps %xmm3, %xmm0
1962 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0]
1963 ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1]
1964 ; SSE-NEXT: movaps %xmm3, 1008(%rdx)
1965 ; SSE-NEXT: movaps %xmm0, 992(%rdx)
1966 ; SSE-NEXT: movaps %xmm1, 976(%rdx)
1967 ; SSE-NEXT: movaps %xmm2, 960(%rdx)
1968 ; SSE-NEXT: movaps %xmm5, 944(%rdx)
1969 ; SSE-NEXT: movaps %xmm7, 928(%rdx)
1970 ; SSE-NEXT: movaps %xmm6, 912(%rdx)
1971 ; SSE-NEXT: movaps %xmm9, 896(%rdx)
1972 ; SSE-NEXT: movaps %xmm8, 880(%rdx)
1973 ; SSE-NEXT: movaps %xmm10, 864(%rdx)
1974 ; SSE-NEXT: movaps %xmm12, 848(%rdx)
1975 ; SSE-NEXT: movaps %xmm14, 832(%rdx)
1976 ; SSE-NEXT: movaps %xmm11, 816(%rdx)
1977 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1978 ; SSE-NEXT: movaps %xmm0, 800(%rdx)
1979 ; SSE-NEXT: movaps %xmm13, 784(%rdx)
1980 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1981 ; SSE-NEXT: movaps %xmm0, 768(%rdx)
1982 ; SSE-NEXT: movaps %xmm15, 752(%rdx)
1983 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1984 ; SSE-NEXT: movaps %xmm0, 736(%rdx)
1985 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1986 ; SSE-NEXT: movaps %xmm0, 720(%rdx)
1987 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1988 ; SSE-NEXT: movaps %xmm0, 704(%rdx)
1989 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1990 ; SSE-NEXT: movaps %xmm0, 688(%rdx)
1991 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1992 ; SSE-NEXT: movaps %xmm0, 672(%rdx)
1993 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1994 ; SSE-NEXT: movaps %xmm0, 656(%rdx)
1995 ; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
1996 ; SSE-NEXT: movaps %xmm0, 640(%rdx)
1997 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1998 ; SSE-NEXT: movaps %xmm0, 624(%rdx)
1999 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2000 ; SSE-NEXT: movaps %xmm0, 608(%rdx)
2001 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2002 ; SSE-NEXT: movaps %xmm0, 592(%rdx)
2003 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2004 ; SSE-NEXT: movaps %xmm0, 576(%rdx)
2005 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2006 ; SSE-NEXT: movaps %xmm0, 560(%rdx)
2007 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2008 ; SSE-NEXT: movaps %xmm0, 544(%rdx)
2009 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2010 ; SSE-NEXT: movaps %xmm0, 528(%rdx)
2011 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2012 ; SSE-NEXT: movaps %xmm0, 512(%rdx)
2013 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2014 ; SSE-NEXT: movaps %xmm0, 496(%rdx)
2015 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2016 ; SSE-NEXT: movaps %xmm0, 480(%rdx)
2017 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2018 ; SSE-NEXT: movaps %xmm0, 464(%rdx)
2019 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2020 ; SSE-NEXT: movaps %xmm0, 448(%rdx)
2021 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2022 ; SSE-NEXT: movaps %xmm0, 432(%rdx)
2023 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2024 ; SSE-NEXT: movaps %xmm0, 416(%rdx)
2025 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2026 ; SSE-NEXT: movaps %xmm0, 400(%rdx)
2027 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2028 ; SSE-NEXT: movaps %xmm0, 384(%rdx)
2029 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2030 ; SSE-NEXT: movaps %xmm0, 368(%rdx)
2031 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2032 ; SSE-NEXT: movaps %xmm0, 352(%rdx)
2033 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2034 ; SSE-NEXT: movaps %xmm0, 336(%rdx)
2035 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2036 ; SSE-NEXT: movaps %xmm0, 320(%rdx)
2037 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2038 ; SSE-NEXT: movaps %xmm0, 304(%rdx)
2039 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2040 ; SSE-NEXT: movaps %xmm0, 288(%rdx)
2041 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2042 ; SSE-NEXT: movaps %xmm0, 272(%rdx)
2043 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2044 ; SSE-NEXT: movaps %xmm0, 256(%rdx)
2045 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2046 ; SSE-NEXT: movaps %xmm0, 240(%rdx)
2047 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2048 ; SSE-NEXT: movaps %xmm0, 224(%rdx)
2049 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2050 ; SSE-NEXT: movaps %xmm0, 208(%rdx)
2051 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2052 ; SSE-NEXT: movaps %xmm0, 192(%rdx)
2053 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2054 ; SSE-NEXT: movaps %xmm0, 176(%rdx)
2055 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2056 ; SSE-NEXT: movaps %xmm0, 160(%rdx)
2057 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2058 ; SSE-NEXT: movaps %xmm0, 144(%rdx)
2059 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2060 ; SSE-NEXT: movaps %xmm0, 128(%rdx)
2061 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2062 ; SSE-NEXT: movaps %xmm0, 112(%rdx)
2063 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2064 ; SSE-NEXT: movaps %xmm0, 96(%rdx)
2065 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2066 ; SSE-NEXT: movaps %xmm0, 80(%rdx)
2067 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2068 ; SSE-NEXT: movaps %xmm0, 64(%rdx)
2069 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2070 ; SSE-NEXT: movaps %xmm0, 48(%rdx)
2071 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2072 ; SSE-NEXT: movaps %xmm0, 32(%rdx)
2073 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2074 ; SSE-NEXT: movaps %xmm0, 16(%rdx)
2075 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2076 ; SSE-NEXT: movaps %xmm0, (%rdx)
2077 ; SSE-NEXT: addq $664, %rsp # imm = 0x298
2078 ; SSE-NEXT: retq
2080 ; AVX-LABEL: store_i64_stride2_vf64:
2081 ; AVX: # %bb.0:
2082 ; AVX-NEXT: subq $424, %rsp # imm = 0x1A8
2083 ; AVX-NEXT: vmovaps (%rsi), %xmm0
2084 ; AVX-NEXT: vmovaps 32(%rsi), %xmm1
2085 ; AVX-NEXT: vmovaps 64(%rsi), %xmm2
2086 ; AVX-NEXT: vmovaps 96(%rsi), %xmm3
2087 ; AVX-NEXT: vmovaps (%rdi), %xmm4
2088 ; AVX-NEXT: vmovaps 32(%rdi), %xmm5
2089 ; AVX-NEXT: vmovaps 64(%rdi), %xmm6
2090 ; AVX-NEXT: vmovaps 96(%rdi), %xmm7
2091 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm4[1],xmm0[1]
2092 ; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm4[0],xmm0[0]
2093 ; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm0
2094 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2095 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm5[1],xmm1[1]
2096 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm1[0]
2097 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
2098 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2099 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm6[1],xmm2[1]
2100 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm6[0],xmm2[0]
2101 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
2102 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2103 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm7[1],xmm3[1]
2104 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm7[0],xmm3[0]
2105 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
2106 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2107 ; AVX-NEXT: vmovaps 128(%rsi), %xmm0
2108 ; AVX-NEXT: vmovaps 128(%rdi), %xmm1
2109 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm1[1],xmm0[1]
2110 ; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
2111 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
2112 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2113 ; AVX-NEXT: vmovaps 160(%rsi), %xmm0
2114 ; AVX-NEXT: vmovaps 160(%rdi), %xmm1
2115 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm1[1],xmm0[1]
2116 ; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
2117 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
2118 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2119 ; AVX-NEXT: vmovaps 192(%rsi), %xmm0
2120 ; AVX-NEXT: vmovaps 192(%rdi), %xmm1
2121 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm1[1],xmm0[1]
2122 ; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
2123 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
2124 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2125 ; AVX-NEXT: vmovaps 224(%rsi), %xmm0
2126 ; AVX-NEXT: vmovaps 224(%rdi), %xmm1
2127 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm1[1],xmm0[1]
2128 ; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
2129 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
2130 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2131 ; AVX-NEXT: vmovaps 256(%rsi), %xmm0
2132 ; AVX-NEXT: vmovaps 256(%rdi), %xmm1
2133 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm1[1],xmm0[1]
2134 ; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
2135 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
2136 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2137 ; AVX-NEXT: vmovaps 288(%rsi), %xmm0
2138 ; AVX-NEXT: vmovaps 288(%rdi), %xmm1
2139 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm1[1],xmm0[1]
2140 ; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
2141 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
2142 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2143 ; AVX-NEXT: vmovaps 320(%rsi), %xmm0
2144 ; AVX-NEXT: vmovaps 320(%rdi), %xmm1
2145 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm1[1],xmm0[1]
2146 ; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
2147 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
2148 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2149 ; AVX-NEXT: vmovaps 352(%rsi), %xmm0
2150 ; AVX-NEXT: vmovaps 352(%rdi), %xmm1
2151 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm1[1],xmm0[1]
2152 ; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
2153 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
2154 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2155 ; AVX-NEXT: vmovaps 384(%rsi), %xmm0
2156 ; AVX-NEXT: vmovaps 384(%rdi), %xmm1
2157 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm1[1],xmm0[1]
2158 ; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
2159 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
2160 ; AVX-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
2161 ; AVX-NEXT: vmovaps 416(%rsi), %xmm0
2162 ; AVX-NEXT: vmovaps 416(%rdi), %xmm1
2163 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm1[1],xmm0[1]
2164 ; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
2165 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
2166 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2167 ; AVX-NEXT: vmovaps 448(%rsi), %xmm0
2168 ; AVX-NEXT: vmovaps 448(%rdi), %xmm1
2169 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm1[1],xmm0[1]
2170 ; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
2171 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
2172 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2173 ; AVX-NEXT: vmovaps 480(%rsi), %xmm0
2174 ; AVX-NEXT: vmovaps 480(%rdi), %xmm1
2175 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm1[1],xmm0[1]
2176 ; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
2177 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
2178 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2179 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3,2,3]
2180 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3,2,3]
2181 ; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[3]
2182 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2183 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3,2,3]
2184 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3,2,3]
2185 ; AVX-NEXT: vshufpd {{.*#+}} ymm6 = ymm1[0],ymm0[0],ymm1[3],ymm0[3]
2186 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3,2,3]
2187 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3,2,3]
2188 ; AVX-NEXT: vshufpd {{.*#+}} ymm5 = ymm1[0],ymm0[0],ymm1[3],ymm0[3]
2189 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3,2,3]
2190 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3,2,3]
2191 ; AVX-NEXT: vshufpd {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[3],ymm0[3]
2192 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3,2,3]
2193 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3,2,3]
2194 ; AVX-NEXT: vshufpd {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[3],ymm0[3]
2195 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3,2,3]
2196 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3,2,3]
2197 ; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[3],ymm0[3]
2198 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3,2,3]
2199 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3,2,3]
2200 ; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[3],ymm0[3]
2201 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3,2,3]
2202 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm8 = mem[2,3,2,3]
2203 ; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm8[0],ymm0[0],ymm8[3],ymm0[3]
2204 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm8 = mem[2,3,2,3]
2205 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm9 = mem[2,3,2,3]
2206 ; AVX-NEXT: vshufpd {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[3],ymm8[3]
2207 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm9 = mem[2,3,2,3]
2208 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm10 = mem[2,3,2,3]
2209 ; AVX-NEXT: vshufpd {{.*#+}} ymm9 = ymm10[0],ymm9[0],ymm10[3],ymm9[3]
2210 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm10 = mem[2,3,2,3]
2211 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm11 = mem[2,3,2,3]
2212 ; AVX-NEXT: vshufpd {{.*#+}} ymm10 = ymm11[0],ymm10[0],ymm11[3],ymm10[3]
2213 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm11 = mem[2,3,2,3]
2214 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm12 = mem[2,3,2,3]
2215 ; AVX-NEXT: vshufpd {{.*#+}} ymm11 = ymm12[0],ymm11[0],ymm12[3],ymm11[3]
2216 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm12 = mem[2,3,2,3]
2217 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm13 = mem[2,3,2,3]
2218 ; AVX-NEXT: vshufpd {{.*#+}} ymm12 = ymm13[0],ymm12[0],ymm13[3],ymm12[3]
2219 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm13 = mem[2,3,2,3]
2220 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm14 = mem[2,3,2,3]
2221 ; AVX-NEXT: vshufpd {{.*#+}} ymm13 = ymm14[0],ymm13[0],ymm14[3],ymm13[3]
2222 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm14 = mem[2,3,2,3]
2223 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm15 = mem[2,3,2,3]
2224 ; AVX-NEXT: vshufpd {{.*#+}} ymm14 = ymm15[0],ymm14[0],ymm15[3],ymm14[3]
2225 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm15 = mem[2,3,2,3]
2226 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm7 = mem[2,3,2,3]
2227 ; AVX-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[0],ymm15[0],ymm7[3],ymm15[3]
2228 ; AVX-NEXT: vmovapd %ymm7, 992(%rdx)
2229 ; AVX-NEXT: vmovapd %ymm14, 928(%rdx)
2230 ; AVX-NEXT: vmovapd %ymm13, 864(%rdx)
2231 ; AVX-NEXT: vmovapd %ymm12, 800(%rdx)
2232 ; AVX-NEXT: vmovapd %ymm11, 736(%rdx)
2233 ; AVX-NEXT: vmovapd %ymm10, 672(%rdx)
2234 ; AVX-NEXT: vmovapd %ymm9, 608(%rdx)
2235 ; AVX-NEXT: vmovapd %ymm8, 544(%rdx)
2236 ; AVX-NEXT: vmovapd %ymm0, 480(%rdx)
2237 ; AVX-NEXT: vmovapd %ymm1, 416(%rdx)
2238 ; AVX-NEXT: vmovapd %ymm2, 352(%rdx)
2239 ; AVX-NEXT: vmovapd %ymm3, 288(%rdx)
2240 ; AVX-NEXT: vmovapd %ymm4, 224(%rdx)
2241 ; AVX-NEXT: vmovapd %ymm5, 160(%rdx)
2242 ; AVX-NEXT: vmovapd %ymm6, 96(%rdx)
2243 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2244 ; AVX-NEXT: vmovaps %ymm0, 32(%rdx)
2245 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2246 ; AVX-NEXT: vmovaps %ymm0, 960(%rdx)
2247 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2248 ; AVX-NEXT: vmovaps %ymm0, 896(%rdx)
2249 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2250 ; AVX-NEXT: vmovaps %ymm0, 832(%rdx)
2251 ; AVX-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
2252 ; AVX-NEXT: vmovaps %ymm0, 768(%rdx)
2253 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2254 ; AVX-NEXT: vmovaps %ymm0, 704(%rdx)
2255 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2256 ; AVX-NEXT: vmovaps %ymm0, 640(%rdx)
2257 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2258 ; AVX-NEXT: vmovaps %ymm0, 576(%rdx)
2259 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2260 ; AVX-NEXT: vmovaps %ymm0, 512(%rdx)
2261 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2262 ; AVX-NEXT: vmovaps %ymm0, 448(%rdx)
2263 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2264 ; AVX-NEXT: vmovaps %ymm0, 384(%rdx)
2265 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2266 ; AVX-NEXT: vmovaps %ymm0, 320(%rdx)
2267 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2268 ; AVX-NEXT: vmovaps %ymm0, 256(%rdx)
2269 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2270 ; AVX-NEXT: vmovaps %ymm0, 192(%rdx)
2271 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2272 ; AVX-NEXT: vmovaps %ymm0, 128(%rdx)
2273 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2274 ; AVX-NEXT: vmovaps %ymm0, 64(%rdx)
2275 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2276 ; AVX-NEXT: vmovaps %ymm0, (%rdx)
2277 ; AVX-NEXT: addq $424, %rsp # imm = 0x1A8
2278 ; AVX-NEXT: vzeroupper
2279 ; AVX-NEXT: retq
2281 ; AVX2-LABEL: store_i64_stride2_vf64:
2282 ; AVX2: # %bb.0:
2283 ; AVX2-NEXT: subq $456, %rsp # imm = 0x1C8
2284 ; AVX2-NEXT: vmovaps 192(%rdi), %ymm0
2285 ; AVX2-NEXT: vmovaps 160(%rdi), %ymm1
2286 ; AVX2-NEXT: vmovaps 128(%rdi), %ymm2
2287 ; AVX2-NEXT: vmovaps (%rdi), %ymm7
2288 ; AVX2-NEXT: vmovaps 32(%rdi), %ymm8
2289 ; AVX2-NEXT: vmovaps 64(%rdi), %ymm6
2290 ; AVX2-NEXT: vmovaps 96(%rdi), %ymm4
2291 ; AVX2-NEXT: vmovaps 192(%rsi), %ymm3
2292 ; AVX2-NEXT: vmovaps 160(%rsi), %ymm5
2293 ; AVX2-NEXT: vmovaps 128(%rsi), %ymm9
2294 ; AVX2-NEXT: vmovaps (%rsi), %ymm10
2295 ; AVX2-NEXT: vmovaps 32(%rsi), %ymm11
2296 ; AVX2-NEXT: vmovaps 64(%rsi), %ymm12
2297 ; AVX2-NEXT: vmovaps 96(%rsi), %ymm13
2298 ; AVX2-NEXT: vpermpd {{.*#+}} ymm14 = ymm10[0,0,2,1]
2299 ; AVX2-NEXT: vpermpd {{.*#+}} ymm15 = ymm7[0,1,1,3]
2300 ; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3],ymm15[4,5],ymm14[6,7]
2301 ; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2302 ; AVX2-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,2,2,3]
2303 ; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,1,3,3]
2304 ; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm10[2,3],ymm7[4,5],ymm10[6,7]
2305 ; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2306 ; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm11[0,0,2,1]
2307 ; AVX2-NEXT: vpermpd {{.*#+}} ymm10 = ymm8[0,1,1,3]
2308 ; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1],ymm7[2,3],ymm10[4,5],ymm7[6,7]
2309 ; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2310 ; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm11[0,2,2,3]
2311 ; AVX2-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,1,3,3]
2312 ; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3],ymm8[4,5],ymm7[6,7]
2313 ; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2314 ; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm12[0,0,2,1]
2315 ; AVX2-NEXT: vpermpd {{.*#+}} ymm8 = ymm6[0,1,1,3]
2316 ; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3],ymm8[4,5],ymm7[6,7]
2317 ; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2318 ; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm12[0,2,2,3]
2319 ; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,1,3,3]
2320 ; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3],ymm6[4,5],ymm7[6,7]
2321 ; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2322 ; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm13[0,0,2,1]
2323 ; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm4[0,1,1,3]
2324 ; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7]
2325 ; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2326 ; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm13[0,2,2,3]
2327 ; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,1,3,3]
2328 ; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm6[2,3],ymm4[4,5],ymm6[6,7]
2329 ; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2330 ; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm9[0,0,2,1]
2331 ; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm2[0,1,1,3]
2332 ; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7]
2333 ; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2334 ; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm9[0,2,2,3]
2335 ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,3,3]
2336 ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3],ymm2[4,5],ymm4[6,7]
2337 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2338 ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm5[0,0,2,1]
2339 ; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm1[0,1,1,3]
2340 ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7]
2341 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2342 ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm5[0,2,2,3]
2343 ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,3,3]
2344 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7]
2345 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2346 ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm3[0,0,2,1]
2347 ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[0,1,1,3]
2348 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7]
2349 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2350 ; AVX2-NEXT: vmovaps 224(%rdi), %ymm1
2351 ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm3[0,2,2,3]
2352 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3]
2353 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
2354 ; AVX2-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
2355 ; AVX2-NEXT: vmovaps 224(%rsi), %ymm0
2356 ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[0,0,2,1]
2357 ; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm1[0,1,1,3]
2358 ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
2359 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2360 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
2361 ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,3,3]
2362 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
2363 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2364 ; AVX2-NEXT: vmovaps 256(%rdi), %ymm0
2365 ; AVX2-NEXT: vmovaps 256(%rsi), %ymm1
2366 ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm1[0,0,2,1]
2367 ; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[0,1,1,3]
2368 ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
2369 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2370 ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
2371 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3]
2372 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
2373 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2374 ; AVX2-NEXT: vmovaps 288(%rdi), %ymm0
2375 ; AVX2-NEXT: vmovaps 288(%rsi), %ymm1
2376 ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm1[0,0,2,1]
2377 ; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[0,1,1,3]
2378 ; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
2379 ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
2380 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3]
2381 ; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
2382 ; AVX2-NEXT: vmovaps 320(%rdi), %ymm0
2383 ; AVX2-NEXT: vmovaps 320(%rsi), %ymm1
2384 ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm1[0,0,2,1]
2385 ; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[0,1,1,3]
2386 ; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
2387 ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
2388 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3]
2389 ; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
2390 ; AVX2-NEXT: vmovaps 352(%rdi), %ymm0
2391 ; AVX2-NEXT: vmovaps 352(%rsi), %ymm1
2392 ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm1[0,0,2,1]
2393 ; AVX2-NEXT: vpermpd {{.*#+}} ymm9 = ymm0[0,1,1,3]
2394 ; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1],ymm2[2,3],ymm9[4,5],ymm2[6,7]
2395 ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
2396 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3]
2397 ; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
2398 ; AVX2-NEXT: vmovaps 384(%rdi), %ymm9
2399 ; AVX2-NEXT: vmovaps 384(%rsi), %ymm0
2400 ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,0,2,1]
2401 ; AVX2-NEXT: vpermpd {{.*#+}} ymm11 = ymm9[0,1,1,3]
2402 ; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1],ymm1[2,3],ymm11[4,5],ymm1[6,7]
2403 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
2404 ; AVX2-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,1,3,3]
2405 ; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm9[0,1],ymm0[2,3],ymm9[4,5],ymm0[6,7]
2406 ; AVX2-NEXT: vmovaps 416(%rdi), %ymm9
2407 ; AVX2-NEXT: vmovaps 416(%rsi), %ymm11
2408 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm11[0,0,2,1]
2409 ; AVX2-NEXT: vpermpd {{.*#+}} ymm13 = ymm9[0,1,1,3]
2410 ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm13[0,1],ymm0[2,3],ymm13[4,5],ymm0[6,7]
2411 ; AVX2-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,2,2,3]
2412 ; AVX2-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,1,3,3]
2413 ; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm11[2,3],ymm9[4,5],ymm11[6,7]
2414 ; AVX2-NEXT: vmovaps 448(%rdi), %ymm11
2415 ; AVX2-NEXT: vmovaps 448(%rsi), %ymm13
2416 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm13[0,0,2,1]
2417 ; AVX2-NEXT: vpermpd {{.*#+}} ymm15 = ymm11[0,1,1,3]
2418 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1],ymm0[2,3],ymm15[4,5],ymm0[6,7]
2419 ; AVX2-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,2,2,3]
2420 ; AVX2-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,1,3,3]
2421 ; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm13[2,3],ymm11[4,5],ymm13[6,7]
2422 ; AVX2-NEXT: vmovaps 480(%rdi), %ymm13
2423 ; AVX2-NEXT: vmovaps 480(%rsi), %ymm15
2424 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm15[0,0,2,1]
2425 ; AVX2-NEXT: vpermpd {{.*#+}} ymm14 = ymm13[0,1,1,3]
2426 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm0[2,3],ymm14[4,5],ymm0[6,7]
2427 ; AVX2-NEXT: vpermpd {{.*#+}} ymm14 = ymm15[0,2,2,3]
2428 ; AVX2-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,1,3,3]
2429 ; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm14[2,3],ymm13[4,5],ymm14[6,7]
2430 ; AVX2-NEXT: vmovaps %ymm13, 992(%rdx)
2431 ; AVX2-NEXT: vmovaps %ymm0, 960(%rdx)
2432 ; AVX2-NEXT: vmovaps %ymm11, 928(%rdx)
2433 ; AVX2-NEXT: vmovaps %ymm1, 896(%rdx)
2434 ; AVX2-NEXT: vmovaps %ymm9, 864(%rdx)
2435 ; AVX2-NEXT: vmovaps %ymm2, 832(%rdx)
2436 ; AVX2-NEXT: vmovaps %ymm3, 800(%rdx)
2437 ; AVX2-NEXT: vmovaps %ymm4, 768(%rdx)
2438 ; AVX2-NEXT: vmovaps %ymm5, 736(%rdx)
2439 ; AVX2-NEXT: vmovaps %ymm6, 704(%rdx)
2440 ; AVX2-NEXT: vmovaps %ymm7, 672(%rdx)
2441 ; AVX2-NEXT: vmovaps %ymm8, 640(%rdx)
2442 ; AVX2-NEXT: vmovaps %ymm10, 608(%rdx)
2443 ; AVX2-NEXT: vmovaps %ymm12, 576(%rdx)
2444 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2445 ; AVX2-NEXT: vmovaps %ymm0, 544(%rdx)
2446 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2447 ; AVX2-NEXT: vmovaps %ymm0, 512(%rdx)
2448 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2449 ; AVX2-NEXT: vmovaps %ymm0, 480(%rdx)
2450 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2451 ; AVX2-NEXT: vmovaps %ymm0, 448(%rdx)
2452 ; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
2453 ; AVX2-NEXT: vmovaps %ymm0, 416(%rdx)
2454 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2455 ; AVX2-NEXT: vmovaps %ymm0, 384(%rdx)
2456 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2457 ; AVX2-NEXT: vmovaps %ymm0, 352(%rdx)
2458 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2459 ; AVX2-NEXT: vmovaps %ymm0, 320(%rdx)
2460 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2461 ; AVX2-NEXT: vmovaps %ymm0, 288(%rdx)
2462 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2463 ; AVX2-NEXT: vmovaps %ymm0, 256(%rdx)
2464 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2465 ; AVX2-NEXT: vmovaps %ymm0, 224(%rdx)
2466 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2467 ; AVX2-NEXT: vmovaps %ymm0, 192(%rdx)
2468 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2469 ; AVX2-NEXT: vmovaps %ymm0, 160(%rdx)
2470 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2471 ; AVX2-NEXT: vmovaps %ymm0, 128(%rdx)
2472 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2473 ; AVX2-NEXT: vmovaps %ymm0, 96(%rdx)
2474 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2475 ; AVX2-NEXT: vmovaps %ymm0, 64(%rdx)
2476 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2477 ; AVX2-NEXT: vmovaps %ymm0, 32(%rdx)
2478 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2479 ; AVX2-NEXT: vmovaps %ymm0, (%rdx)
2480 ; AVX2-NEXT: addq $456, %rsp # imm = 0x1C8
2481 ; AVX2-NEXT: vzeroupper
2482 ; AVX2-NEXT: retq
2484 ; AVX2-FP-LABEL: store_i64_stride2_vf64:
2485 ; AVX2-FP: # %bb.0:
2486 ; AVX2-FP-NEXT: subq $456, %rsp # imm = 0x1C8
2487 ; AVX2-FP-NEXT: vmovaps 192(%rdi), %ymm0
2488 ; AVX2-FP-NEXT: vmovaps 160(%rdi), %ymm1
2489 ; AVX2-FP-NEXT: vmovaps 128(%rdi), %ymm2
2490 ; AVX2-FP-NEXT: vmovaps (%rdi), %ymm7
2491 ; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm8
2492 ; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm6
2493 ; AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm4
2494 ; AVX2-FP-NEXT: vmovaps 192(%rsi), %ymm3
2495 ; AVX2-FP-NEXT: vmovaps 160(%rsi), %ymm5
2496 ; AVX2-FP-NEXT: vmovaps 128(%rsi), %ymm9
2497 ; AVX2-FP-NEXT: vmovaps (%rsi), %ymm10
2498 ; AVX2-FP-NEXT: vmovaps 32(%rsi), %ymm11
2499 ; AVX2-FP-NEXT: vmovaps 64(%rsi), %ymm12
2500 ; AVX2-FP-NEXT: vmovaps 96(%rsi), %ymm13
2501 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm14 = ymm10[0,0,2,1]
2502 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm15 = ymm7[0,1,1,3]
2503 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3],ymm15[4,5],ymm14[6,7]
2504 ; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2505 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,2,2,3]
2506 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,1,3,3]
2507 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm10[2,3],ymm7[4,5],ymm10[6,7]
2508 ; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2509 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm11[0,0,2,1]
2510 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm10 = ymm8[0,1,1,3]
2511 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1],ymm7[2,3],ymm10[4,5],ymm7[6,7]
2512 ; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2513 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm11[0,2,2,3]
2514 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,1,3,3]
2515 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3],ymm8[4,5],ymm7[6,7]
2516 ; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2517 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm12[0,0,2,1]
2518 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm8 = ymm6[0,1,1,3]
2519 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3],ymm8[4,5],ymm7[6,7]
2520 ; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2521 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm12[0,2,2,3]
2522 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,1,3,3]
2523 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3],ymm6[4,5],ymm7[6,7]
2524 ; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2525 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm6 = ymm13[0,0,2,1]
2526 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm4[0,1,1,3]
2527 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7]
2528 ; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2529 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm6 = ymm13[0,2,2,3]
2530 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,1,3,3]
2531 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm6[2,3],ymm4[4,5],ymm6[6,7]
2532 ; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2533 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm4 = ymm9[0,0,2,1]
2534 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm6 = ymm2[0,1,1,3]
2535 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7]
2536 ; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2537 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm4 = ymm9[0,2,2,3]
2538 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,3,3]
2539 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3],ymm2[4,5],ymm4[6,7]
2540 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2541 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm5[0,0,2,1]
2542 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm4 = ymm1[0,1,1,3]
2543 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7]
2544 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2545 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm5[0,2,2,3]
2546 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,3,3]
2547 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7]
2548 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2549 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm3[0,0,2,1]
2550 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[0,1,1,3]
2551 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7]
2552 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2553 ; AVX2-FP-NEXT: vmovaps 224(%rdi), %ymm1
2554 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm3[0,2,2,3]
2555 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3]
2556 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
2557 ; AVX2-FP-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
2558 ; AVX2-FP-NEXT: vmovaps 224(%rsi), %ymm0
2559 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[0,0,2,1]
2560 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm1[0,1,1,3]
2561 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
2562 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2563 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
2564 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,3,3]
2565 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
2566 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2567 ; AVX2-FP-NEXT: vmovaps 256(%rdi), %ymm0
2568 ; AVX2-FP-NEXT: vmovaps 256(%rsi), %ymm1
2569 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm1[0,0,2,1]
2570 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[0,1,1,3]
2571 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
2572 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2573 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
2574 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3]
2575 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
2576 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2577 ; AVX2-FP-NEXT: vmovaps 288(%rdi), %ymm0
2578 ; AVX2-FP-NEXT: vmovaps 288(%rsi), %ymm1
2579 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm1[0,0,2,1]
2580 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[0,1,1,3]
2581 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
2582 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
2583 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3]
2584 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
2585 ; AVX2-FP-NEXT: vmovaps 320(%rdi), %ymm0
2586 ; AVX2-FP-NEXT: vmovaps 320(%rsi), %ymm1
2587 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm1[0,0,2,1]
2588 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[0,1,1,3]
2589 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
2590 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
2591 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3]
2592 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
2593 ; AVX2-FP-NEXT: vmovaps 352(%rdi), %ymm0
2594 ; AVX2-FP-NEXT: vmovaps 352(%rsi), %ymm1
2595 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm1[0,0,2,1]
2596 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm9 = ymm0[0,1,1,3]
2597 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1],ymm2[2,3],ymm9[4,5],ymm2[6,7]
2598 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
2599 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3]
2600 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
2601 ; AVX2-FP-NEXT: vmovaps 384(%rdi), %ymm9
2602 ; AVX2-FP-NEXT: vmovaps 384(%rsi), %ymm0
2603 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,0,2,1]
2604 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm11 = ymm9[0,1,1,3]
2605 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1],ymm1[2,3],ymm11[4,5],ymm1[6,7]
2606 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
2607 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,1,3,3]
2608 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm9[0,1],ymm0[2,3],ymm9[4,5],ymm0[6,7]
2609 ; AVX2-FP-NEXT: vmovaps 416(%rdi), %ymm9
2610 ; AVX2-FP-NEXT: vmovaps 416(%rsi), %ymm11
2611 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm11[0,0,2,1]
2612 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm13 = ymm9[0,1,1,3]
2613 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm13[0,1],ymm0[2,3],ymm13[4,5],ymm0[6,7]
2614 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,2,2,3]
2615 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,1,3,3]
2616 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm11[2,3],ymm9[4,5],ymm11[6,7]
2617 ; AVX2-FP-NEXT: vmovaps 448(%rdi), %ymm11
2618 ; AVX2-FP-NEXT: vmovaps 448(%rsi), %ymm13
2619 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm13[0,0,2,1]
2620 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm15 = ymm11[0,1,1,3]
2621 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1],ymm0[2,3],ymm15[4,5],ymm0[6,7]
2622 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,2,2,3]
2623 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,1,3,3]
2624 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm13[2,3],ymm11[4,5],ymm13[6,7]
2625 ; AVX2-FP-NEXT: vmovaps 480(%rdi), %ymm13
2626 ; AVX2-FP-NEXT: vmovaps 480(%rsi), %ymm15
2627 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm15[0,0,2,1]
2628 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm14 = ymm13[0,1,1,3]
2629 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm0[2,3],ymm14[4,5],ymm0[6,7]
2630 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm14 = ymm15[0,2,2,3]
2631 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,1,3,3]
2632 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm14[2,3],ymm13[4,5],ymm14[6,7]
2633 ; AVX2-FP-NEXT: vmovaps %ymm13, 992(%rdx)
2634 ; AVX2-FP-NEXT: vmovaps %ymm0, 960(%rdx)
2635 ; AVX2-FP-NEXT: vmovaps %ymm11, 928(%rdx)
2636 ; AVX2-FP-NEXT: vmovaps %ymm1, 896(%rdx)
2637 ; AVX2-FP-NEXT: vmovaps %ymm9, 864(%rdx)
2638 ; AVX2-FP-NEXT: vmovaps %ymm2, 832(%rdx)
2639 ; AVX2-FP-NEXT: vmovaps %ymm3, 800(%rdx)
2640 ; AVX2-FP-NEXT: vmovaps %ymm4, 768(%rdx)
2641 ; AVX2-FP-NEXT: vmovaps %ymm5, 736(%rdx)
2642 ; AVX2-FP-NEXT: vmovaps %ymm6, 704(%rdx)
2643 ; AVX2-FP-NEXT: vmovaps %ymm7, 672(%rdx)
2644 ; AVX2-FP-NEXT: vmovaps %ymm8, 640(%rdx)
2645 ; AVX2-FP-NEXT: vmovaps %ymm10, 608(%rdx)
2646 ; AVX2-FP-NEXT: vmovaps %ymm12, 576(%rdx)
2647 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2648 ; AVX2-FP-NEXT: vmovaps %ymm0, 544(%rdx)
2649 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2650 ; AVX2-FP-NEXT: vmovaps %ymm0, 512(%rdx)
2651 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2652 ; AVX2-FP-NEXT: vmovaps %ymm0, 480(%rdx)
2653 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2654 ; AVX2-FP-NEXT: vmovaps %ymm0, 448(%rdx)
2655 ; AVX2-FP-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
2656 ; AVX2-FP-NEXT: vmovaps %ymm0, 416(%rdx)
2657 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2658 ; AVX2-FP-NEXT: vmovaps %ymm0, 384(%rdx)
2659 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2660 ; AVX2-FP-NEXT: vmovaps %ymm0, 352(%rdx)
2661 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2662 ; AVX2-FP-NEXT: vmovaps %ymm0, 320(%rdx)
2663 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2664 ; AVX2-FP-NEXT: vmovaps %ymm0, 288(%rdx)
2665 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2666 ; AVX2-FP-NEXT: vmovaps %ymm0, 256(%rdx)
2667 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2668 ; AVX2-FP-NEXT: vmovaps %ymm0, 224(%rdx)
2669 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2670 ; AVX2-FP-NEXT: vmovaps %ymm0, 192(%rdx)
2671 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2672 ; AVX2-FP-NEXT: vmovaps %ymm0, 160(%rdx)
2673 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2674 ; AVX2-FP-NEXT: vmovaps %ymm0, 128(%rdx)
2675 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2676 ; AVX2-FP-NEXT: vmovaps %ymm0, 96(%rdx)
2677 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2678 ; AVX2-FP-NEXT: vmovaps %ymm0, 64(%rdx)
2679 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2680 ; AVX2-FP-NEXT: vmovaps %ymm0, 32(%rdx)
2681 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2682 ; AVX2-FP-NEXT: vmovaps %ymm0, (%rdx)
2683 ; AVX2-FP-NEXT: addq $456, %rsp # imm = 0x1C8
2684 ; AVX2-FP-NEXT: vzeroupper
2685 ; AVX2-FP-NEXT: retq
2687 ; AVX2-FCP-LABEL: store_i64_stride2_vf64:
2688 ; AVX2-FCP: # %bb.0:
2689 ; AVX2-FCP-NEXT: subq $456, %rsp # imm = 0x1C8
2690 ; AVX2-FCP-NEXT: vmovaps 192(%rdi), %ymm0
2691 ; AVX2-FCP-NEXT: vmovaps 160(%rdi), %ymm1
2692 ; AVX2-FCP-NEXT: vmovaps 128(%rdi), %ymm2
2693 ; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm7
2694 ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm8
2695 ; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm6
2696 ; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm4
2697 ; AVX2-FCP-NEXT: vmovaps 192(%rsi), %ymm3
2698 ; AVX2-FCP-NEXT: vmovaps 160(%rsi), %ymm5
2699 ; AVX2-FCP-NEXT: vmovaps 128(%rsi), %ymm9
2700 ; AVX2-FCP-NEXT: vmovaps (%rsi), %ymm10
2701 ; AVX2-FCP-NEXT: vmovaps 32(%rsi), %ymm11
2702 ; AVX2-FCP-NEXT: vmovaps 64(%rsi), %ymm12
2703 ; AVX2-FCP-NEXT: vmovaps 96(%rsi), %ymm13
2704 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm14 = ymm10[0,0,2,1]
2705 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm15 = ymm7[0,1,1,3]
2706 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3],ymm15[4,5],ymm14[6,7]
2707 ; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2708 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,2,2,3]
2709 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,1,3,3]
2710 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm10[2,3],ymm7[4,5],ymm10[6,7]
2711 ; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2712 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm7 = ymm11[0,0,2,1]
2713 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm10 = ymm8[0,1,1,3]
2714 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1],ymm7[2,3],ymm10[4,5],ymm7[6,7]
2715 ; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2716 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm7 = ymm11[0,2,2,3]
2717 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,1,3,3]
2718 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3],ymm8[4,5],ymm7[6,7]
2719 ; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2720 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm7 = ymm12[0,0,2,1]
2721 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm8 = ymm6[0,1,1,3]
2722 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3],ymm8[4,5],ymm7[6,7]
2723 ; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2724 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm7 = ymm12[0,2,2,3]
2725 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,1,3,3]
2726 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3],ymm6[4,5],ymm7[6,7]
2727 ; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2728 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm6 = ymm13[0,0,2,1]
2729 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm7 = ymm4[0,1,1,3]
2730 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7]
2731 ; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2732 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm6 = ymm13[0,2,2,3]
2733 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,1,3,3]
2734 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm6[2,3],ymm4[4,5],ymm6[6,7]
2735 ; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2736 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm4 = ymm9[0,0,2,1]
2737 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm6 = ymm2[0,1,1,3]
2738 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7]
2739 ; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2740 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm4 = ymm9[0,2,2,3]
2741 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,3,3]
2742 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3],ymm2[4,5],ymm4[6,7]
2743 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2744 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm5[0,0,2,1]
2745 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm4 = ymm1[0,1,1,3]
2746 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7]
2747 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2748 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm5[0,2,2,3]
2749 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,3,3]
2750 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7]
2751 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2752 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm3[0,0,2,1]
2753 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[0,1,1,3]
2754 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7]
2755 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2756 ; AVX2-FCP-NEXT: vmovaps 224(%rdi), %ymm1
2757 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm3[0,2,2,3]
2758 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3]
2759 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
2760 ; AVX2-FCP-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
2761 ; AVX2-FCP-NEXT: vmovaps 224(%rsi), %ymm0
2762 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[0,0,2,1]
2763 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm1[0,1,1,3]
2764 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
2765 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2766 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
2767 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,3,3]
2768 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
2769 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2770 ; AVX2-FCP-NEXT: vmovaps 256(%rdi), %ymm0
2771 ; AVX2-FCP-NEXT: vmovaps 256(%rsi), %ymm1
2772 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm1[0,0,2,1]
2773 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[0,1,1,3]
2774 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
2775 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2776 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
2777 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3]
2778 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
2779 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2780 ; AVX2-FCP-NEXT: vmovaps 288(%rdi), %ymm0
2781 ; AVX2-FCP-NEXT: vmovaps 288(%rsi), %ymm1
2782 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm1[0,0,2,1]
2783 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[0,1,1,3]
2784 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
2785 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
2786 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3]
2787 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
2788 ; AVX2-FCP-NEXT: vmovaps 320(%rdi), %ymm0
2789 ; AVX2-FCP-NEXT: vmovaps 320(%rsi), %ymm1
2790 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm1[0,0,2,1]
2791 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[0,1,1,3]
2792 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
2793 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
2794 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3]
2795 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
2796 ; AVX2-FCP-NEXT: vmovaps 352(%rdi), %ymm0
2797 ; AVX2-FCP-NEXT: vmovaps 352(%rsi), %ymm1
2798 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm1[0,0,2,1]
2799 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm9 = ymm0[0,1,1,3]
2800 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1],ymm2[2,3],ymm9[4,5],ymm2[6,7]
2801 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
2802 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3]
2803 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
2804 ; AVX2-FCP-NEXT: vmovaps 384(%rdi), %ymm9
2805 ; AVX2-FCP-NEXT: vmovaps 384(%rsi), %ymm0
2806 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,0,2,1]
2807 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm11 = ymm9[0,1,1,3]
2808 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1],ymm1[2,3],ymm11[4,5],ymm1[6,7]
2809 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
2810 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,1,3,3]
2811 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm9[0,1],ymm0[2,3],ymm9[4,5],ymm0[6,7]
2812 ; AVX2-FCP-NEXT: vmovaps 416(%rdi), %ymm9
2813 ; AVX2-FCP-NEXT: vmovaps 416(%rsi), %ymm11
2814 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm11[0,0,2,1]
2815 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm13 = ymm9[0,1,1,3]
2816 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm13[0,1],ymm0[2,3],ymm13[4,5],ymm0[6,7]
2817 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,2,2,3]
2818 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,1,3,3]
2819 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm11[2,3],ymm9[4,5],ymm11[6,7]
2820 ; AVX2-FCP-NEXT: vmovaps 448(%rdi), %ymm11
2821 ; AVX2-FCP-NEXT: vmovaps 448(%rsi), %ymm13
2822 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm13[0,0,2,1]
2823 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm15 = ymm11[0,1,1,3]
2824 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1],ymm0[2,3],ymm15[4,5],ymm0[6,7]
2825 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,2,2,3]
2826 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,1,3,3]
2827 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm13[2,3],ymm11[4,5],ymm13[6,7]
2828 ; AVX2-FCP-NEXT: vmovaps 480(%rdi), %ymm13
2829 ; AVX2-FCP-NEXT: vmovaps 480(%rsi), %ymm15
2830 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm15[0,0,2,1]
2831 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm14 = ymm13[0,1,1,3]
2832 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm0[2,3],ymm14[4,5],ymm0[6,7]
2833 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm14 = ymm15[0,2,2,3]
2834 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,1,3,3]
2835 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm14[2,3],ymm13[4,5],ymm14[6,7]
2836 ; AVX2-FCP-NEXT: vmovaps %ymm13, 992(%rdx)
2837 ; AVX2-FCP-NEXT: vmovaps %ymm0, 960(%rdx)
2838 ; AVX2-FCP-NEXT: vmovaps %ymm11, 928(%rdx)
2839 ; AVX2-FCP-NEXT: vmovaps %ymm1, 896(%rdx)
2840 ; AVX2-FCP-NEXT: vmovaps %ymm9, 864(%rdx)
2841 ; AVX2-FCP-NEXT: vmovaps %ymm2, 832(%rdx)
2842 ; AVX2-FCP-NEXT: vmovaps %ymm3, 800(%rdx)
2843 ; AVX2-FCP-NEXT: vmovaps %ymm4, 768(%rdx)
2844 ; AVX2-FCP-NEXT: vmovaps %ymm5, 736(%rdx)
2845 ; AVX2-FCP-NEXT: vmovaps %ymm6, 704(%rdx)
2846 ; AVX2-FCP-NEXT: vmovaps %ymm7, 672(%rdx)
2847 ; AVX2-FCP-NEXT: vmovaps %ymm8, 640(%rdx)
2848 ; AVX2-FCP-NEXT: vmovaps %ymm10, 608(%rdx)
2849 ; AVX2-FCP-NEXT: vmovaps %ymm12, 576(%rdx)
2850 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2851 ; AVX2-FCP-NEXT: vmovaps %ymm0, 544(%rdx)
2852 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2853 ; AVX2-FCP-NEXT: vmovaps %ymm0, 512(%rdx)
2854 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2855 ; AVX2-FCP-NEXT: vmovaps %ymm0, 480(%rdx)
2856 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2857 ; AVX2-FCP-NEXT: vmovaps %ymm0, 448(%rdx)
2858 ; AVX2-FCP-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
2859 ; AVX2-FCP-NEXT: vmovaps %ymm0, 416(%rdx)
2860 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2861 ; AVX2-FCP-NEXT: vmovaps %ymm0, 384(%rdx)
2862 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2863 ; AVX2-FCP-NEXT: vmovaps %ymm0, 352(%rdx)
2864 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2865 ; AVX2-FCP-NEXT: vmovaps %ymm0, 320(%rdx)
2866 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2867 ; AVX2-FCP-NEXT: vmovaps %ymm0, 288(%rdx)
2868 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2869 ; AVX2-FCP-NEXT: vmovaps %ymm0, 256(%rdx)
2870 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2871 ; AVX2-FCP-NEXT: vmovaps %ymm0, 224(%rdx)
2872 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2873 ; AVX2-FCP-NEXT: vmovaps %ymm0, 192(%rdx)
2874 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2875 ; AVX2-FCP-NEXT: vmovaps %ymm0, 160(%rdx)
2876 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2877 ; AVX2-FCP-NEXT: vmovaps %ymm0, 128(%rdx)
2878 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2879 ; AVX2-FCP-NEXT: vmovaps %ymm0, 96(%rdx)
2880 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2881 ; AVX2-FCP-NEXT: vmovaps %ymm0, 64(%rdx)
2882 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2883 ; AVX2-FCP-NEXT: vmovaps %ymm0, 32(%rdx)
2884 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2885 ; AVX2-FCP-NEXT: vmovaps %ymm0, (%rdx)
2886 ; AVX2-FCP-NEXT: addq $456, %rsp # imm = 0x1C8
2887 ; AVX2-FCP-NEXT: vzeroupper
2888 ; AVX2-FCP-NEXT: retq
2890 ; AVX512-LABEL: store_i64_stride2_vf64:
2891 ; AVX512: # %bb.0:
2892 ; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm0
2893 ; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm1
2894 ; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm2
2895 ; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm3
2896 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm4
2897 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm5
2898 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm6
2899 ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm7
2900 ; AVX512-NEXT: vmovdqa64 448(%rsi), %zmm8
2901 ; AVX512-NEXT: vmovdqa64 384(%rsi), %zmm9
2902 ; AVX512-NEXT: vmovdqa64 320(%rsi), %zmm10
2903 ; AVX512-NEXT: vmovdqa64 256(%rsi), %zmm11
2904 ; AVX512-NEXT: vmovdqa64 (%rsi), %zmm12
2905 ; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm13
2906 ; AVX512-NEXT: vmovdqa64 128(%rsi), %zmm14
2907 ; AVX512-NEXT: vmovdqa64 192(%rsi), %zmm15
2908 ; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm16 = [4,12,5,13,6,14,7,15]
2909 ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm17
2910 ; AVX512-NEXT: vpermt2q %zmm12, %zmm16, %zmm17
2911 ; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,8,1,9,2,10,3,11]
2912 ; AVX512-NEXT: vpermt2q %zmm12, %zmm18, %zmm4
2913 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm12
2914 ; AVX512-NEXT: vpermt2q %zmm13, %zmm16, %zmm12
2915 ; AVX512-NEXT: vpermt2q %zmm13, %zmm18, %zmm5
2916 ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm13
2917 ; AVX512-NEXT: vpermt2q %zmm14, %zmm16, %zmm13
2918 ; AVX512-NEXT: vpermt2q %zmm14, %zmm18, %zmm6
2919 ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm14
2920 ; AVX512-NEXT: vpermt2q %zmm15, %zmm16, %zmm14
2921 ; AVX512-NEXT: vpermt2q %zmm15, %zmm18, %zmm7
2922 ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm15
2923 ; AVX512-NEXT: vpermt2q %zmm11, %zmm16, %zmm15
2924 ; AVX512-NEXT: vpermt2q %zmm11, %zmm18, %zmm3
2925 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm11
2926 ; AVX512-NEXT: vpermt2q %zmm10, %zmm16, %zmm11
2927 ; AVX512-NEXT: vpermt2q %zmm10, %zmm18, %zmm2
2928 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm10
2929 ; AVX512-NEXT: vpermt2q %zmm9, %zmm16, %zmm10
2930 ; AVX512-NEXT: vpermt2q %zmm9, %zmm18, %zmm1
2931 ; AVX512-NEXT: vpermi2q %zmm8, %zmm0, %zmm16
2932 ; AVX512-NEXT: vpermt2q %zmm8, %zmm18, %zmm0
2933 ; AVX512-NEXT: vmovdqa64 %zmm0, 896(%rdx)
2934 ; AVX512-NEXT: vmovdqa64 %zmm16, 960(%rdx)
2935 ; AVX512-NEXT: vmovdqa64 %zmm1, 768(%rdx)
2936 ; AVX512-NEXT: vmovdqa64 %zmm10, 832(%rdx)
2937 ; AVX512-NEXT: vmovdqa64 %zmm2, 640(%rdx)
2938 ; AVX512-NEXT: vmovdqa64 %zmm11, 704(%rdx)
2939 ; AVX512-NEXT: vmovdqa64 %zmm3, 512(%rdx)
2940 ; AVX512-NEXT: vmovdqa64 %zmm15, 576(%rdx)
2941 ; AVX512-NEXT: vmovdqa64 %zmm7, 384(%rdx)
2942 ; AVX512-NEXT: vmovdqa64 %zmm14, 448(%rdx)
2943 ; AVX512-NEXT: vmovdqa64 %zmm6, 256(%rdx)
2944 ; AVX512-NEXT: vmovdqa64 %zmm13, 320(%rdx)
2945 ; AVX512-NEXT: vmovdqa64 %zmm5, 128(%rdx)
2946 ; AVX512-NEXT: vmovdqa64 %zmm12, 192(%rdx)
2947 ; AVX512-NEXT: vmovdqa64 %zmm4, (%rdx)
2948 ; AVX512-NEXT: vmovdqa64 %zmm17, 64(%rdx)
2949 ; AVX512-NEXT: vzeroupper
2950 ; AVX512-NEXT: retq
2952 ; AVX512-FCP-LABEL: store_i64_stride2_vf64:
2953 ; AVX512-FCP: # %bb.0:
2954 ; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm0
2955 ; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm1
2956 ; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm2
2957 ; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm3
2958 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm4
2959 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm5
2960 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm6
2961 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm7
2962 ; AVX512-FCP-NEXT: vmovdqa64 448(%rsi), %zmm8
2963 ; AVX512-FCP-NEXT: vmovdqa64 384(%rsi), %zmm9
2964 ; AVX512-FCP-NEXT: vmovdqa64 320(%rsi), %zmm10
2965 ; AVX512-FCP-NEXT: vmovdqa64 256(%rsi), %zmm11
2966 ; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm12
2967 ; AVX512-FCP-NEXT: vmovdqa64 64(%rsi), %zmm13
2968 ; AVX512-FCP-NEXT: vmovdqa64 128(%rsi), %zmm14
2969 ; AVX512-FCP-NEXT: vmovdqa64 192(%rsi), %zmm15
2970 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [4,12,5,13,6,14,7,15]
2971 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm17
2972 ; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm16, %zmm17
2973 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,8,1,9,2,10,3,11]
2974 ; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm18, %zmm4
2975 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm12
2976 ; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm16, %zmm12
2977 ; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm18, %zmm5
2978 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm13
2979 ; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm16, %zmm13
2980 ; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm18, %zmm6
2981 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm14
2982 ; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm16, %zmm14
2983 ; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm18, %zmm7
2984 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm15
2985 ; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm16, %zmm15
2986 ; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm18, %zmm3
2987 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm11
2988 ; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm16, %zmm11
2989 ; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm18, %zmm2
2990 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm10
2991 ; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm16, %zmm10
2992 ; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm18, %zmm1
2993 ; AVX512-FCP-NEXT: vpermi2q %zmm8, %zmm0, %zmm16
2994 ; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm18, %zmm0
2995 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 896(%rdx)
2996 ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, 960(%rdx)
2997 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 768(%rdx)
2998 ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, 832(%rdx)
2999 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 640(%rdx)
3000 ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, 704(%rdx)
3001 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 512(%rdx)
3002 ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, 576(%rdx)
3003 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 384(%rdx)
3004 ; AVX512-FCP-NEXT: vmovdqa64 %zmm14, 448(%rdx)
3005 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 256(%rdx)
3006 ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 320(%rdx)
3007 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 128(%rdx)
3008 ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 192(%rdx)
3009 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%rdx)
3010 ; AVX512-FCP-NEXT: vmovdqa64 %zmm17, 64(%rdx)
3011 ; AVX512-FCP-NEXT: vzeroupper
3012 ; AVX512-FCP-NEXT: retq
3014 ; AVX512DQ-LABEL: store_i64_stride2_vf64:
3015 ; AVX512DQ: # %bb.0:
3016 ; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm0
3017 ; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm1
3018 ; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm2
3019 ; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm3
3020 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm4
3021 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm5
3022 ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm6
3023 ; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm7
3024 ; AVX512DQ-NEXT: vmovdqa64 448(%rsi), %zmm8
3025 ; AVX512DQ-NEXT: vmovdqa64 384(%rsi), %zmm9
3026 ; AVX512DQ-NEXT: vmovdqa64 320(%rsi), %zmm10
3027 ; AVX512DQ-NEXT: vmovdqa64 256(%rsi), %zmm11
3028 ; AVX512DQ-NEXT: vmovdqa64 (%rsi), %zmm12
3029 ; AVX512DQ-NEXT: vmovdqa64 64(%rsi), %zmm13
3030 ; AVX512DQ-NEXT: vmovdqa64 128(%rsi), %zmm14
3031 ; AVX512DQ-NEXT: vmovdqa64 192(%rsi), %zmm15
3032 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm16 = [4,12,5,13,6,14,7,15]
3033 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm17
3034 ; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm16, %zmm17
3035 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,8,1,9,2,10,3,11]
3036 ; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm18, %zmm4
3037 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm12
3038 ; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm16, %zmm12
3039 ; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm18, %zmm5
3040 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm13
3041 ; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm16, %zmm13
3042 ; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm18, %zmm6
3043 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm14
3044 ; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm16, %zmm14
3045 ; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm18, %zmm7
3046 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm15
3047 ; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm16, %zmm15
3048 ; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm18, %zmm3
3049 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm11
3050 ; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm16, %zmm11
3051 ; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm18, %zmm2
3052 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm10
3053 ; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm16, %zmm10
3054 ; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm18, %zmm1
3055 ; AVX512DQ-NEXT: vpermi2q %zmm8, %zmm0, %zmm16
3056 ; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm18, %zmm0
3057 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 896(%rdx)
3058 ; AVX512DQ-NEXT: vmovdqa64 %zmm16, 960(%rdx)
3059 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, 768(%rdx)
3060 ; AVX512DQ-NEXT: vmovdqa64 %zmm10, 832(%rdx)
3061 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, 640(%rdx)
3062 ; AVX512DQ-NEXT: vmovdqa64 %zmm11, 704(%rdx)
3063 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, 512(%rdx)
3064 ; AVX512DQ-NEXT: vmovdqa64 %zmm15, 576(%rdx)
3065 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, 384(%rdx)
3066 ; AVX512DQ-NEXT: vmovdqa64 %zmm14, 448(%rdx)
3067 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, 256(%rdx)
3068 ; AVX512DQ-NEXT: vmovdqa64 %zmm13, 320(%rdx)
3069 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, 128(%rdx)
3070 ; AVX512DQ-NEXT: vmovdqa64 %zmm12, 192(%rdx)
3071 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, (%rdx)
3072 ; AVX512DQ-NEXT: vmovdqa64 %zmm17, 64(%rdx)
3073 ; AVX512DQ-NEXT: vzeroupper
3074 ; AVX512DQ-NEXT: retq
3076 ; AVX512DQ-FCP-LABEL: store_i64_stride2_vf64:
3077 ; AVX512DQ-FCP: # %bb.0:
3078 ; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm0
3079 ; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm1
3080 ; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm2
3081 ; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm3
3082 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm4
3083 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm5
3084 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm6
3085 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm7
3086 ; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rsi), %zmm8
3087 ; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rsi), %zmm9
3088 ; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rsi), %zmm10
3089 ; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rsi), %zmm11
3090 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm12
3091 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rsi), %zmm13
3092 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rsi), %zmm14
3093 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rsi), %zmm15
3094 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [4,12,5,13,6,14,7,15]
3095 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm17
3096 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm16, %zmm17
3097 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,8,1,9,2,10,3,11]
3098 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm18, %zmm4
3099 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm12
3100 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm16, %zmm12
3101 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm18, %zmm5
3102 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm13
3103 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm16, %zmm13
3104 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm18, %zmm6
3105 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm14
3106 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm16, %zmm14
3107 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm18, %zmm7
3108 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm15
3109 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm16, %zmm15
3110 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm18, %zmm3
3111 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm11
3112 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm16, %zmm11
3113 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm18, %zmm2
3114 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm10
3115 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm16, %zmm10
3116 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm18, %zmm1
3117 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm8, %zmm0, %zmm16
3118 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm18, %zmm0
3119 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 896(%rdx)
3120 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, 960(%rdx)
3121 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 768(%rdx)
3122 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, 832(%rdx)
3123 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 640(%rdx)
3124 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, 704(%rdx)
3125 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 512(%rdx)
3126 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, 576(%rdx)
3127 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 384(%rdx)
3128 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, 448(%rdx)
3129 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 256(%rdx)
3130 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 320(%rdx)
3131 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 128(%rdx)
3132 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, 192(%rdx)
3133 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, (%rdx)
3134 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, 64(%rdx)
3135 ; AVX512DQ-FCP-NEXT: vzeroupper
3136 ; AVX512DQ-FCP-NEXT: retq
3138 ; AVX512BW-LABEL: store_i64_stride2_vf64:
3139 ; AVX512BW: # %bb.0:
3140 ; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm0
3141 ; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm1
3142 ; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm2
3143 ; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm3
3144 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm4
3145 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm5
3146 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm6
3147 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm7
3148 ; AVX512BW-NEXT: vmovdqa64 448(%rsi), %zmm8
3149 ; AVX512BW-NEXT: vmovdqa64 384(%rsi), %zmm9
3150 ; AVX512BW-NEXT: vmovdqa64 320(%rsi), %zmm10
3151 ; AVX512BW-NEXT: vmovdqa64 256(%rsi), %zmm11
3152 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm12
3153 ; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm13
3154 ; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm14
3155 ; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm15
3156 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm16 = [4,12,5,13,6,14,7,15]
3157 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm17
3158 ; AVX512BW-NEXT: vpermt2q %zmm12, %zmm16, %zmm17
3159 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,8,1,9,2,10,3,11]
3160 ; AVX512BW-NEXT: vpermt2q %zmm12, %zmm18, %zmm4
3161 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm12
3162 ; AVX512BW-NEXT: vpermt2q %zmm13, %zmm16, %zmm12
3163 ; AVX512BW-NEXT: vpermt2q %zmm13, %zmm18, %zmm5
3164 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm13
3165 ; AVX512BW-NEXT: vpermt2q %zmm14, %zmm16, %zmm13
3166 ; AVX512BW-NEXT: vpermt2q %zmm14, %zmm18, %zmm6
3167 ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm14
3168 ; AVX512BW-NEXT: vpermt2q %zmm15, %zmm16, %zmm14
3169 ; AVX512BW-NEXT: vpermt2q %zmm15, %zmm18, %zmm7
3170 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm15
3171 ; AVX512BW-NEXT: vpermt2q %zmm11, %zmm16, %zmm15
3172 ; AVX512BW-NEXT: vpermt2q %zmm11, %zmm18, %zmm3
3173 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm11
3174 ; AVX512BW-NEXT: vpermt2q %zmm10, %zmm16, %zmm11
3175 ; AVX512BW-NEXT: vpermt2q %zmm10, %zmm18, %zmm2
3176 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10
3177 ; AVX512BW-NEXT: vpermt2q %zmm9, %zmm16, %zmm10
3178 ; AVX512BW-NEXT: vpermt2q %zmm9, %zmm18, %zmm1
3179 ; AVX512BW-NEXT: vpermi2q %zmm8, %zmm0, %zmm16
3180 ; AVX512BW-NEXT: vpermt2q %zmm8, %zmm18, %zmm0
3181 ; AVX512BW-NEXT: vmovdqa64 %zmm0, 896(%rdx)
3182 ; AVX512BW-NEXT: vmovdqa64 %zmm16, 960(%rdx)
3183 ; AVX512BW-NEXT: vmovdqa64 %zmm1, 768(%rdx)
3184 ; AVX512BW-NEXT: vmovdqa64 %zmm10, 832(%rdx)
3185 ; AVX512BW-NEXT: vmovdqa64 %zmm2, 640(%rdx)
3186 ; AVX512BW-NEXT: vmovdqa64 %zmm11, 704(%rdx)
3187 ; AVX512BW-NEXT: vmovdqa64 %zmm3, 512(%rdx)
3188 ; AVX512BW-NEXT: vmovdqa64 %zmm15, 576(%rdx)
3189 ; AVX512BW-NEXT: vmovdqa64 %zmm7, 384(%rdx)
3190 ; AVX512BW-NEXT: vmovdqa64 %zmm14, 448(%rdx)
3191 ; AVX512BW-NEXT: vmovdqa64 %zmm6, 256(%rdx)
3192 ; AVX512BW-NEXT: vmovdqa64 %zmm13, 320(%rdx)
3193 ; AVX512BW-NEXT: vmovdqa64 %zmm5, 128(%rdx)
3194 ; AVX512BW-NEXT: vmovdqa64 %zmm12, 192(%rdx)
3195 ; AVX512BW-NEXT: vmovdqa64 %zmm4, (%rdx)
3196 ; AVX512BW-NEXT: vmovdqa64 %zmm17, 64(%rdx)
3197 ; AVX512BW-NEXT: vzeroupper
3198 ; AVX512BW-NEXT: retq
3200 ; AVX512BW-FCP-LABEL: store_i64_stride2_vf64:
; AVX512BW-FCP: # %bb.0:
; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm0
; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm1
; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm2
; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm3
; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm4
; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm5
; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm6
; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm7
; AVX512BW-FCP-NEXT: vmovdqa64 448(%rsi), %zmm8
; AVX512BW-FCP-NEXT: vmovdqa64 384(%rsi), %zmm9
; AVX512BW-FCP-NEXT: vmovdqa64 320(%rsi), %zmm10
; AVX512BW-FCP-NEXT: vmovdqa64 256(%rsi), %zmm11
; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm12
; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm13
; AVX512BW-FCP-NEXT: vmovdqa64 128(%rsi), %zmm14
; AVX512BW-FCP-NEXT: vmovdqa64 192(%rsi), %zmm15
; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [4,12,5,13,6,14,7,15]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm17
; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm16, %zmm17
; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,8,1,9,2,10,3,11]
; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm18, %zmm4
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm12
; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm16, %zmm12
; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm18, %zmm5
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm13
; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm16, %zmm13
; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm18, %zmm6
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm14
; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm16, %zmm14
; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm18, %zmm7
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm15
; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm16, %zmm15
; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm18, %zmm3
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm11
; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm16, %zmm11
; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm18, %zmm2
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10
; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm16, %zmm10
; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm18, %zmm1
; AVX512BW-FCP-NEXT: vpermi2q %zmm8, %zmm0, %zmm16
; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm18, %zmm0
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 896(%rdx)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, 960(%rdx)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 768(%rdx)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, 832(%rdx)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 640(%rdx)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, 704(%rdx)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 512(%rdx)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, 576(%rdx)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, 384(%rdx)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, 448(%rdx)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 256(%rdx)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, 320(%rdx)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 128(%rdx)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, 192(%rdx)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, (%rdx)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, 64(%rdx)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
; AVX512DQ-BW-LABEL: store_i64_stride2_vf64:
; AVX512DQ-BW: # %bb.0:
; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm0
; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm1
; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm2
; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm3
; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm4
; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm5
; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm6
; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm7
; AVX512DQ-BW-NEXT: vmovdqa64 448(%rsi), %zmm8
; AVX512DQ-BW-NEXT: vmovdqa64 384(%rsi), %zmm9
; AVX512DQ-BW-NEXT: vmovdqa64 320(%rsi), %zmm10
; AVX512DQ-BW-NEXT: vmovdqa64 256(%rsi), %zmm11
; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm12
; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm13
; AVX512DQ-BW-NEXT: vmovdqa64 128(%rsi), %zmm14
; AVX512DQ-BW-NEXT: vmovdqa64 192(%rsi), %zmm15
; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm16 = [4,12,5,13,6,14,7,15]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm17
; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm16, %zmm17
; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,8,1,9,2,10,3,11]
; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm18, %zmm4
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm12
; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm16, %zmm12
; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm18, %zmm5
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm13
; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm16, %zmm13
; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm18, %zmm6
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm14
; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm16, %zmm14
; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm18, %zmm7
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm15
; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm16, %zmm15
; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm18, %zmm3
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm11
; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm16, %zmm11
; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm18, %zmm2
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm10
; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm16, %zmm10
; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm18, %zmm1
; AVX512DQ-BW-NEXT: vpermi2q %zmm8, %zmm0, %zmm16
; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm18, %zmm0
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 896(%rdx)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, 960(%rdx)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 768(%rdx)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, 832(%rdx)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 640(%rdx)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, 704(%rdx)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 512(%rdx)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, 576(%rdx)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, 384(%rdx)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, 448(%rdx)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 256(%rdx)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, 320(%rdx)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 128(%rdx)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, 192(%rdx)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, (%rdx)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, 64(%rdx)
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
; AVX512DQ-BW-FCP-LABEL: store_i64_stride2_vf64:
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm1
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm2
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm3
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm4
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm5
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm6
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm7
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rsi), %zmm8
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rsi), %zmm9
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rsi), %zmm10
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rsi), %zmm11
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm12
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm13
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rsi), %zmm14
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rsi), %zmm15
; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [4,12,5,13,6,14,7,15]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm17
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm16, %zmm17
; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,8,1,9,2,10,3,11]
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm18, %zmm4
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm12
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm16, %zmm12
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm18, %zmm5
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm13
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm16, %zmm13
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm18, %zmm6
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm14
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm16, %zmm14
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm18, %zmm7
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm15
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm16, %zmm15
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm18, %zmm3
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm11
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm16, %zmm11
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm18, %zmm2
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm16, %zmm10
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm18, %zmm1
; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm8, %zmm0, %zmm16
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm18, %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 896(%rdx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, 960(%rdx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 768(%rdx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, 832(%rdx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 640(%rdx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, 704(%rdx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 512(%rdx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, 576(%rdx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, 384(%rdx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, 448(%rdx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 256(%rdx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, 320(%rdx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 128(%rdx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, 192(%rdx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, (%rdx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, 64(%rdx)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
  %in.vec0 = load <64 x i64>, ptr %in.vecptr0, align 64
  %in.vec1 = load <64 x i64>, ptr %in.vecptr1, align 64
  %1 = shufflevector <64 x i64> %in.vec0, <64 x i64> %in.vec1, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
  %interleaved.vec = shufflevector <128 x i64> %1, <128 x i64> poison, <128 x i32> <i32 0, i32 64, i32 1, i32 65, i32 2, i32 66, i32 3, i32 67, i32 4, i32 68, i32 5, i32 69, i32 6, i32 70, i32 7, i32 71, i32 8, i32 72, i32 9, i32 73, i32 10, i32 74, i32 11, i32 75, i32 12, i32 76, i32 13, i32 77, i32 14, i32 78, i32 15, i32 79, i32 16, i32 80, i32 17, i32 81, i32 18, i32 82, i32 19, i32 83, i32 20, i32 84, i32 21, i32 85, i32 22, i32 86, i32 23, i32 87, i32 24, i32 88, i32 25, i32 89, i32 26, i32 90, i32 27, i32 91, i32 28, i32 92, i32 29, i32 93, i32 30, i32 94, i32 31, i32 95, i32 32, i32 96, i32 33, i32 97, i32 34, i32 98, i32 35, i32 99, i32 36, i32 100, i32 37, i32 101, i32 38, i32 102, i32 39, i32 103, i32 40, i32 104, i32 41, i32 105, i32 42, i32 106, i32 43, i32 107, i32 44, i32 108, i32 45, i32 109, i32 46, i32 110, i32 47, i32 111, i32 48, i32 112, i32 49, i32 113, i32 50, i32 114, i32 51, i32 115, i32 52, i32 116, i32 53, i32 117, i32 54, i32 118, i32 55, i32 119, i32 56, i32 120, i32 57, i32 121, i32 58, i32 122, i32 59, i32 123, i32 60, i32 124, i32 61, i32 125, i32 62, i32 126, i32 63, i32 127>
  store <128 x i64> %interleaved.vec, ptr %out.vec, align 64
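  ; Note on the IR above: the first shufflevector simply concatenates %in.vec0 and
  ; %in.vec1 into one <128 x i64> value, and the second uses the mask
  ; <0, 64, 1, 65, ..., 63, 127> to interleave the two halves element by element,
  ; i.e. the stride-2 interleaved layout that the single wide store then writes out.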