1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,FALLBACK0
3 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-ONLY,FALLBACK1
4 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX1,AVX2,AVX2-ONLY,AVX2-SLOW,FALLBACK2
5 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX1,AVX2,AVX2-ONLY,AVX2-FAST,FALLBACK3
6 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX1,AVX2,AVX2-ONLY,AVX2-FAST-PERLANE,FALLBACK4
7 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512-SLOW,AVX512F-SLOW,AVX512F-ONLY-SLOW,FALLBACK5
8 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512-FAST,AVX512F-FAST,AVX512F-ONLY-FAST,FALLBACK6
9 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512-SLOW,AVX512F-SLOW,AVX512DQ-SLOW,FALLBACK7
10 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512-FAST,AVX512F-FAST,AVX512DQ-FAST,FALLBACK8
11 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512-SLOW,AVX512BW-SLOW,AVX512BW-ONLY-SLOW,FALLBACK9
12 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512-FAST,AVX512BW-FAST,AVX512BW-ONLY-FAST,FALLBACK10
13 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512-SLOW,AVX512BW-SLOW,AVX512DQBW-SLOW,FALLBACK11
14 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512-FAST,AVX512BW-FAST,AVX512DQBW-FAST,FALLBACK12
16 ; These patterns are produced by LoopVectorizer for interleaved stores.
; Stride-2 interleaved store with 2 elements per input vector.
; Loads one <2 x i64> from each of %in.vecptr0 and %in.vecptr1, interleaves
; them element-wise, and stores the resulting <4 x i64> to %out.vec.
; All three memory accesses are declared with 64-byte alignment.
; The per-target check lines are autogenerated (see the NOTE at the top of the
; file); regenerate them with the update script instead of editing by hand.
18 define void @store_i64_stride2_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.vec) nounwind {
19 ; SSE-LABEL: store_i64_stride2_vf2:
21 ; SSE-NEXT: movaps (%rdi), %xmm0
22 ; SSE-NEXT: movaps (%rsi), %xmm1
23 ; SSE-NEXT: movaps %xmm0, %xmm2
24 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
25 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
26 ; SSE-NEXT: movaps %xmm0, 16(%rdx)
27 ; SSE-NEXT: movaps %xmm2, (%rdx)
30 ; AVX1-ONLY-LABEL: store_i64_stride2_vf2:
32 ; AVX1-ONLY-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
33 ; AVX1-ONLY-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1]
34 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[3]
35 ; AVX1-ONLY-NEXT: vmovapd %ymm0, (%rdx)
36 ; AVX1-ONLY-NEXT: vzeroupper
37 ; AVX1-ONLY-NEXT: retq
39 ; AVX2-LABEL: store_i64_stride2_vf2:
41 ; AVX2-NEXT: vmovaps (%rdi), %xmm0
42 ; AVX2-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0
43 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
44 ; AVX2-NEXT: vmovaps %ymm0, (%rdx)
45 ; AVX2-NEXT: vzeroupper
47 %in.vec0 = load <2 x i64>, ptr %in.vecptr0, align 64
48 %in.vec1 = load <2 x i64>, ptr %in.vecptr1, align 64
; Identity mask over both operands: concatenates the inputs, lanes 0-1 from
; %in.vec0 and lanes 2-3 from %in.vec1.
49 %1 = shufflevector <2 x i64> %in.vec0, <2 x i64> %in.vec1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; Interleave: even result lanes take %in.vec0 elements, odd result lanes take
; %in.vec1 elements (in0[0], in1[0], in0[1], in1[1]).
50 %interleaved.vec = shufflevector <4 x i64> %1, <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
51 store <4 x i64> %interleaved.vec, ptr %out.vec, align 64
; Stride-2 interleaved store with 4 elements per input vector.
; Loads one <4 x i64> from each of %in.vecptr0 and %in.vecptr1, interleaves
; them element-wise, and stores the resulting <8 x i64> to %out.vec.
; All three memory accesses are declared with 64-byte alignment.
; The per-target check lines are autogenerated (see the NOTE at the top of the
; file); regenerate them with the update script instead of editing by hand.
55 define void @store_i64_stride2_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.vec) nounwind {
56 ; SSE-LABEL: store_i64_stride2_vf4:
58 ; SSE-NEXT: movaps (%rdi), %xmm0
59 ; SSE-NEXT: movaps 16(%rdi), %xmm1
60 ; SSE-NEXT: movaps (%rsi), %xmm2
61 ; SSE-NEXT: movaps 16(%rsi), %xmm3
62 ; SSE-NEXT: movaps %xmm0, %xmm4
63 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1]
64 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
65 ; SSE-NEXT: movaps %xmm1, %xmm2
66 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1]
67 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0]
68 ; SSE-NEXT: movaps %xmm1, 32(%rdx)
69 ; SSE-NEXT: movaps %xmm2, 48(%rdx)
70 ; SSE-NEXT: movaps %xmm0, (%rdx)
71 ; SSE-NEXT: movaps %xmm4, 16(%rdx)
74 ; AVX1-ONLY-LABEL: store_i64_stride2_vf4:
76 ; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm0
77 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm1
78 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm1[1],xmm0[1]
79 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
80 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
81 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3,2,3]
82 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = mem[2,3,2,3]
83 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[3],ymm1[3]
84 ; AVX1-ONLY-NEXT: vmovapd %ymm1, 32(%rdx)
85 ; AVX1-ONLY-NEXT: vmovapd %ymm0, (%rdx)
86 ; AVX1-ONLY-NEXT: vzeroupper
87 ; AVX1-ONLY-NEXT: retq
89 ; AVX2-ONLY-LABEL: store_i64_stride2_vf4:
91 ; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm0
92 ; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm1
93 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm1[0,0,2,1]
94 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[0,1,1,3]
95 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
96 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
97 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3]
98 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
99 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rdx)
100 ; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rdx)
101 ; AVX2-ONLY-NEXT: vzeroupper
102 ; AVX2-ONLY-NEXT: retq
104 ; AVX512-LABEL: store_i64_stride2_vf4:
106 ; AVX512-NEXT: vmovaps (%rdi), %ymm0
107 ; AVX512-NEXT: vinsertf64x4 $1, (%rsi), %zmm0, %zmm0
108 ; AVX512-NEXT: vmovaps {{.*#+}} zmm1 = [0,4,1,5,2,6,3,7]
109 ; AVX512-NEXT: vpermpd %zmm0, %zmm1, %zmm0
110 ; AVX512-NEXT: vmovaps %zmm0, (%rdx)
111 ; AVX512-NEXT: vzeroupper
113 %in.vec0 = load <4 x i64>, ptr %in.vecptr0, align 64
114 %in.vec1 = load <4 x i64>, ptr %in.vecptr1, align 64
; Identity mask over both operands: concatenates the inputs, lanes 0-3 from
; %in.vec0 and lanes 4-7 from %in.vec1.
115 %1 = shufflevector <4 x i64> %in.vec0, <4 x i64> %in.vec1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; Interleave: even result lanes take %in.vec0 elements, odd result lanes take
; %in.vec1 elements (in0[i], in1[i] for i = 0..3).
116 %interleaved.vec = shufflevector <8 x i64> %1, <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
117 store <8 x i64> %interleaved.vec, ptr %out.vec, align 64
; Stride-2 interleaved store with 8 elements per input vector.
; Loads one <8 x i64> from each of %in.vecptr0 and %in.vecptr1, interleaves
; them element-wise, and stores the resulting <16 x i64> to %out.vec.
; All three memory accesses are declared with 64-byte alignment.
; The per-target check lines are autogenerated (see the NOTE at the top of the
; file); regenerate them with the update script instead of editing by hand.
121 define void @store_i64_stride2_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.vec) nounwind {
122 ; SSE-LABEL: store_i64_stride2_vf8:
124 ; SSE-NEXT: movaps (%rdi), %xmm0
125 ; SSE-NEXT: movaps 16(%rdi), %xmm1
126 ; SSE-NEXT: movaps 32(%rdi), %xmm2
127 ; SSE-NEXT: movaps 48(%rdi), %xmm3
128 ; SSE-NEXT: movaps (%rsi), %xmm4
129 ; SSE-NEXT: movaps 16(%rsi), %xmm5
130 ; SSE-NEXT: movaps 32(%rsi), %xmm6
131 ; SSE-NEXT: movaps 48(%rsi), %xmm7
132 ; SSE-NEXT: movaps %xmm0, %xmm8
133 ; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm4[1]
134 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0]
135 ; SSE-NEXT: movaps %xmm1, %xmm4
136 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm5[1]
137 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0]
138 ; SSE-NEXT: movaps %xmm2, %xmm5
139 ; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm6[1]
140 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm6[0]
141 ; SSE-NEXT: movaps %xmm3, %xmm6
142 ; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm7[1]
143 ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm7[0]
144 ; SSE-NEXT: movaps %xmm3, 96(%rdx)
145 ; SSE-NEXT: movaps %xmm6, 112(%rdx)
146 ; SSE-NEXT: movaps %xmm2, 64(%rdx)
147 ; SSE-NEXT: movaps %xmm5, 80(%rdx)
148 ; SSE-NEXT: movaps %xmm1, 32(%rdx)
149 ; SSE-NEXT: movaps %xmm4, 48(%rdx)
150 ; SSE-NEXT: movaps %xmm0, (%rdx)
151 ; SSE-NEXT: movaps %xmm8, 16(%rdx)
154 ; AVX1-ONLY-LABEL: store_i64_stride2_vf8:
155 ; AVX1-ONLY: # %bb.0:
156 ; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm0
157 ; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm1
158 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm2
159 ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm3
160 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm2[1],xmm0[1]
161 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm2[0],xmm0[0]
162 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
163 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm1[1]
164 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm1[0]
165 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
166 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = mem[2,3,2,3]
167 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = mem[2,3,2,3]
168 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[3],ymm2[3]
169 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = mem[2,3,2,3]
170 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = mem[2,3,2,3]
171 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[3],ymm3[3]
172 ; AVX1-ONLY-NEXT: vmovapd %ymm3, 96(%rdx)
173 ; AVX1-ONLY-NEXT: vmovapd %ymm2, 32(%rdx)
174 ; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rdx)
175 ; AVX1-ONLY-NEXT: vmovapd %ymm0, (%rdx)
176 ; AVX1-ONLY-NEXT: vzeroupper
177 ; AVX1-ONLY-NEXT: retq
179 ; AVX2-ONLY-LABEL: store_i64_stride2_vf8:
180 ; AVX2-ONLY: # %bb.0:
181 ; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm0
182 ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm1
183 ; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm2
184 ; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm3
185 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm2[0,2,2,3]
186 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm0[2,1,3,3]
187 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7]
188 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1]
189 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
190 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
191 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm3[0,2,2,3]
192 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm1[2,1,3,3]
193 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7]
194 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,0,2,1]
195 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,3]
196 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5],ymm3[6,7]
197 ; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rdx)
198 ; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rdx)
199 ; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rdx)
200 ; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%rdx)
201 ; AVX2-ONLY-NEXT: vzeroupper
202 ; AVX2-ONLY-NEXT: retq
204 ; AVX512-LABEL: store_i64_stride2_vf8:
206 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
207 ; AVX512-NEXT: vmovdqa64 (%rsi), %zmm1
208 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,1,9,2,10,3,11]
209 ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
210 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [4,12,5,13,6,14,7,15]
211 ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm3
212 ; AVX512-NEXT: vmovdqa64 %zmm3, 64(%rdx)
213 ; AVX512-NEXT: vmovdqa64 %zmm2, (%rdx)
214 ; AVX512-NEXT: vzeroupper
216 %in.vec0 = load <8 x i64>, ptr %in.vecptr0, align 64
217 %in.vec1 = load <8 x i64>, ptr %in.vecptr1, align 64
; Identity mask over both operands: concatenates the inputs, lanes 0-7 from
; %in.vec0 and lanes 8-15 from %in.vec1.
218 %1 = shufflevector <8 x i64> %in.vec0, <8 x i64> %in.vec1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; Interleave: even result lanes take %in.vec0 elements, odd result lanes take
; %in.vec1 elements (in0[i], in1[i] for i = 0..7).
219 %interleaved.vec = shufflevector <16 x i64> %1, <16 x i64> poison, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
220 store <16 x i64> %interleaved.vec, ptr %out.vec, align 64
; Stride-2 interleaved store with 16 elements per input vector.
; Loads one <16 x i64> from each of %in.vecptr0 and %in.vecptr1, interleaves
; them element-wise, and stores the resulting <32 x i64> to %out.vec.
; All three memory accesses are declared with 64-byte alignment.
; The per-target check lines are autogenerated (see the NOTE at the top of the
; file); regenerate them with the update script instead of editing by hand.
224 define void @store_i64_stride2_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.vec) nounwind {
225 ; SSE-LABEL: store_i64_stride2_vf16:
227 ; SSE-NEXT: movaps 112(%rdi), %xmm0
228 ; SSE-NEXT: movaps 96(%rdi), %xmm6
229 ; SSE-NEXT: movaps 80(%rdi), %xmm4
230 ; SSE-NEXT: movaps 64(%rdi), %xmm3
231 ; SSE-NEXT: movaps (%rdi), %xmm8
232 ; SSE-NEXT: movaps 16(%rdi), %xmm1
233 ; SSE-NEXT: movaps 32(%rdi), %xmm2
234 ; SSE-NEXT: movaps 48(%rdi), %xmm5
235 ; SSE-NEXT: movaps 96(%rsi), %xmm11
236 ; SSE-NEXT: movaps 80(%rsi), %xmm12
237 ; SSE-NEXT: movaps 64(%rsi), %xmm13
238 ; SSE-NEXT: movaps (%rsi), %xmm9
239 ; SSE-NEXT: movaps 16(%rsi), %xmm10
240 ; SSE-NEXT: movaps 32(%rsi), %xmm14
241 ; SSE-NEXT: movaps 48(%rsi), %xmm15
242 ; SSE-NEXT: movaps %xmm8, %xmm7
243 ; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm9[1]
244 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
245 ; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm9[0]
246 ; SSE-NEXT: movaps %xmm1, %xmm9
247 ; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm10[1]
248 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm10[0]
249 ; SSE-NEXT: movaps %xmm2, %xmm10
250 ; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm14[1]
251 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm14[0]
252 ; SSE-NEXT: movaps %xmm5, %xmm14
253 ; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm15[1]
254 ; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm15[0]
255 ; SSE-NEXT: movaps %xmm3, %xmm15
256 ; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm13[1]
257 ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm13[0]
258 ; SSE-NEXT: movaps %xmm4, %xmm13
259 ; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm12[1]
260 ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm12[0]
261 ; SSE-NEXT: movaps %xmm6, %xmm12
262 ; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm11[1]
263 ; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm11[0]
264 ; SSE-NEXT: movaps 112(%rsi), %xmm11
265 ; SSE-NEXT: movaps %xmm0, %xmm7
266 ; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm11[1]
267 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm11[0]
268 ; SSE-NEXT: movaps %xmm0, 224(%rdx)
269 ; SSE-NEXT: movaps %xmm7, 240(%rdx)
270 ; SSE-NEXT: movaps %xmm6, 192(%rdx)
271 ; SSE-NEXT: movaps %xmm12, 208(%rdx)
272 ; SSE-NEXT: movaps %xmm4, 160(%rdx)
273 ; SSE-NEXT: movaps %xmm13, 176(%rdx)
274 ; SSE-NEXT: movaps %xmm3, 128(%rdx)
275 ; SSE-NEXT: movaps %xmm15, 144(%rdx)
276 ; SSE-NEXT: movaps %xmm5, 96(%rdx)
277 ; SSE-NEXT: movaps %xmm14, 112(%rdx)
278 ; SSE-NEXT: movaps %xmm2, 64(%rdx)
279 ; SSE-NEXT: movaps %xmm10, 80(%rdx)
280 ; SSE-NEXT: movaps %xmm1, 32(%rdx)
281 ; SSE-NEXT: movaps %xmm9, 48(%rdx)
282 ; SSE-NEXT: movaps %xmm8, (%rdx)
283 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
284 ; SSE-NEXT: movaps %xmm0, 16(%rdx)
287 ; AVX1-ONLY-LABEL: store_i64_stride2_vf16:
288 ; AVX1-ONLY: # %bb.0:
289 ; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm0
290 ; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm1
291 ; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm2
292 ; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm3
293 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm4
294 ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm5
295 ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm6
296 ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm7
297 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm7[1],xmm3[1]
298 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm7[0],xmm3[0]
299 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm3, %ymm3
300 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm6[1],xmm2[1]
301 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm6[0],xmm2[0]
302 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm2, %ymm2
303 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm4[1],xmm0[1]
304 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm4[0],xmm0[0]
305 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0
306 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm5[1],xmm1[1]
307 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm1[0]
308 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
309 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = mem[2,3,2,3]
310 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = mem[2,3,2,3]
311 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[3],ymm4[3]
312 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = mem[2,3,2,3]
313 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = mem[2,3,2,3]
314 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm6[0],ymm5[0],ymm6[3],ymm5[3]
315 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = mem[2,3,2,3]
316 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = mem[2,3,2,3]
317 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[3],ymm6[3]
318 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = mem[2,3,2,3]
319 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = mem[2,3,2,3]
320 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm8[0],ymm7[0],ymm8[3],ymm7[3]
321 ; AVX1-ONLY-NEXT: vmovapd %ymm7, 32(%rdx)
322 ; AVX1-ONLY-NEXT: vmovapd %ymm6, 96(%rdx)
323 ; AVX1-ONLY-NEXT: vmovapd %ymm5, 160(%rdx)
324 ; AVX1-ONLY-NEXT: vmovapd %ymm4, 224(%rdx)
325 ; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rdx)
326 ; AVX1-ONLY-NEXT: vmovapd %ymm0, (%rdx)
327 ; AVX1-ONLY-NEXT: vmovaps %ymm2, 128(%rdx)
328 ; AVX1-ONLY-NEXT: vmovaps %ymm3, 192(%rdx)
329 ; AVX1-ONLY-NEXT: vzeroupper
330 ; AVX1-ONLY-NEXT: retq
332 ; AVX2-ONLY-LABEL: store_i64_stride2_vf16:
333 ; AVX2-ONLY: # %bb.0:
334 ; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm0
335 ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm1
336 ; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm2
337 ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm3
338 ; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm4
339 ; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm5
340 ; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm6
341 ; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm7
342 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm4[0,2,2,3]
343 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm0[2,1,3,3]
344 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7]
345 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,0,2,1]
346 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
347 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5],ymm4[6,7]
348 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm5[0,2,2,3]
349 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm1[2,1,3,3]
350 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1],ymm4[2,3],ymm9[4,5],ymm4[6,7]
351 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,0,2,1]
352 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,3]
353 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm5[2,3],ymm1[4,5],ymm5[6,7]
354 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm6[0,2,2,3]
355 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm2[2,1,3,3]
356 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm9[0,1],ymm5[2,3],ymm9[4,5],ymm5[6,7]
357 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,0,2,1]
358 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,1,3]
359 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm6[2,3],ymm2[4,5],ymm6[6,7]
360 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm7[0,2,2,3]
361 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm3[2,1,3,3]
362 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1],ymm6[2,3],ymm9[4,5],ymm6[6,7]
363 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,0,2,1]
364 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,1,3]
365 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm7[2,3],ymm3[4,5],ymm7[6,7]
366 ; AVX2-ONLY-NEXT: vmovaps %ymm3, 192(%rdx)
367 ; AVX2-ONLY-NEXT: vmovaps %ymm6, 224(%rdx)
368 ; AVX2-ONLY-NEXT: vmovaps %ymm2, 128(%rdx)
369 ; AVX2-ONLY-NEXT: vmovaps %ymm5, 160(%rdx)
370 ; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rdx)
371 ; AVX2-ONLY-NEXT: vmovaps %ymm4, 96(%rdx)
372 ; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rdx)
373 ; AVX2-ONLY-NEXT: vmovaps %ymm8, 32(%rdx)
374 ; AVX2-ONLY-NEXT: vzeroupper
375 ; AVX2-ONLY-NEXT: retq
377 ; AVX512-LABEL: store_i64_stride2_vf16:
379 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
380 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1
381 ; AVX512-NEXT: vmovdqa64 (%rsi), %zmm2
382 ; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm3
383 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [4,12,5,13,6,14,7,15]
384 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5
385 ; AVX512-NEXT: vpermt2q %zmm2, %zmm4, %zmm5
386 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,8,1,9,2,10,3,11]
387 ; AVX512-NEXT: vpermt2q %zmm2, %zmm6, %zmm0
388 ; AVX512-NEXT: vpermi2q %zmm3, %zmm1, %zmm4
389 ; AVX512-NEXT: vpermt2q %zmm3, %zmm6, %zmm1
390 ; AVX512-NEXT: vmovdqa64 %zmm1, 128(%rdx)
391 ; AVX512-NEXT: vmovdqa64 %zmm4, 192(%rdx)
392 ; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx)
393 ; AVX512-NEXT: vmovdqa64 %zmm5, 64(%rdx)
394 ; AVX512-NEXT: vzeroupper
396 %in.vec0 = load <16 x i64>, ptr %in.vecptr0, align 64
397 %in.vec1 = load <16 x i64>, ptr %in.vecptr1, align 64
; Identity mask over both operands: concatenates the inputs, lanes 0-15 from
; %in.vec0 and lanes 16-31 from %in.vec1.
398 %1 = shufflevector <16 x i64> %in.vec0, <16 x i64> %in.vec1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
; Interleave: even result lanes take %in.vec0 elements, odd result lanes take
; %in.vec1 elements (in0[i], in1[i] for i = 0..15).
399 %interleaved.vec = shufflevector <32 x i64> %1, <32 x i64> poison, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
400 store <32 x i64> %interleaved.vec, ptr %out.vec, align 64
404 define void @store_i64_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.vec) nounwind {
405 ; SSE-LABEL: store_i64_stride2_vf32:
407 ; SSE-NEXT: subq $152, %rsp
408 ; SSE-NEXT: movaps 112(%rdi), %xmm14
409 ; SSE-NEXT: movaps 96(%rdi), %xmm13
410 ; SSE-NEXT: movaps 80(%rdi), %xmm11
411 ; SSE-NEXT: movaps 64(%rdi), %xmm10
412 ; SSE-NEXT: movaps (%rdi), %xmm7
413 ; SSE-NEXT: movaps 16(%rdi), %xmm8
414 ; SSE-NEXT: movaps 32(%rdi), %xmm9
415 ; SSE-NEXT: movaps 48(%rdi), %xmm12
416 ; SSE-NEXT: movaps 96(%rsi), %xmm0
417 ; SSE-NEXT: movaps 80(%rsi), %xmm1
418 ; SSE-NEXT: movaps 64(%rsi), %xmm2
419 ; SSE-NEXT: movaps (%rsi), %xmm3
420 ; SSE-NEXT: movaps 16(%rsi), %xmm4
421 ; SSE-NEXT: movaps 32(%rsi), %xmm5
422 ; SSE-NEXT: movaps 48(%rsi), %xmm6
423 ; SSE-NEXT: movaps %xmm7, %xmm15
424 ; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm3[0]
425 ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
426 ; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm3[1]
427 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
428 ; SSE-NEXT: movaps %xmm8, %xmm7
429 ; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm4[0]
430 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
431 ; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm4[1]
432 ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
433 ; SSE-NEXT: movaps %xmm9, %xmm4
434 ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0]
435 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
436 ; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm5[1]
437 ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
438 ; SSE-NEXT: movaps %xmm12, %xmm4
439 ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm6[0]
440 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
441 ; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm6[1]
442 ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
443 ; SSE-NEXT: movaps %xmm10, %xmm3
444 ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0]
445 ; SSE-NEXT: movaps %xmm3, (%rsp) # 16-byte Spill
446 ; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm2[1]
447 ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
448 ; SSE-NEXT: movaps %xmm11, %xmm2
449 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
450 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
451 ; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm1[1]
452 ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
453 ; SSE-NEXT: movaps %xmm13, %xmm1
454 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
455 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
456 ; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1]
457 ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
458 ; SSE-NEXT: movaps 112(%rsi), %xmm0
459 ; SSE-NEXT: movaps %xmm14, %xmm1
460 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
461 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
462 ; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1]
463 ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
464 ; SSE-NEXT: movaps 128(%rdi), %xmm15
465 ; SSE-NEXT: movaps 128(%rsi), %xmm0
466 ; SSE-NEXT: movaps %xmm15, %xmm1
467 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
468 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
469 ; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1]
470 ; SSE-NEXT: movaps 144(%rdi), %xmm13
471 ; SSE-NEXT: movaps 144(%rsi), %xmm0
472 ; SSE-NEXT: movaps %xmm13, %xmm14
473 ; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm0[0]
474 ; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1]
475 ; SSE-NEXT: movaps 160(%rdi), %xmm10
476 ; SSE-NEXT: movaps 160(%rsi), %xmm0
477 ; SSE-NEXT: movaps %xmm10, %xmm12
478 ; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm0[0]
479 ; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1]
480 ; SSE-NEXT: movaps 176(%rdi), %xmm8
481 ; SSE-NEXT: movaps 176(%rsi), %xmm0
482 ; SSE-NEXT: movaps %xmm8, %xmm11
483 ; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0]
484 ; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1]
485 ; SSE-NEXT: movaps 192(%rdi), %xmm6
486 ; SSE-NEXT: movaps 192(%rsi), %xmm0
487 ; SSE-NEXT: movaps %xmm6, %xmm9
488 ; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm0[0]
489 ; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1]
490 ; SSE-NEXT: movaps 208(%rdi), %xmm5
491 ; SSE-NEXT: movaps 208(%rsi), %xmm1
492 ; SSE-NEXT: movaps %xmm5, %xmm7
493 ; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm1[0]
494 ; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1]
495 ; SSE-NEXT: movaps 224(%rdi), %xmm1
496 ; SSE-NEXT: movaps 224(%rsi), %xmm3
497 ; SSE-NEXT: movaps %xmm1, %xmm2
498 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
499 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
500 ; SSE-NEXT: movaps 240(%rdi), %xmm3
501 ; SSE-NEXT: movaps 240(%rsi), %xmm4
502 ; SSE-NEXT: movaps %xmm3, %xmm0
503 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0]
504 ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1]
505 ; SSE-NEXT: movaps %xmm3, 496(%rdx)
506 ; SSE-NEXT: movaps %xmm0, 480(%rdx)
507 ; SSE-NEXT: movaps %xmm1, 464(%rdx)
508 ; SSE-NEXT: movaps %xmm2, 448(%rdx)
509 ; SSE-NEXT: movaps %xmm5, 432(%rdx)
510 ; SSE-NEXT: movaps %xmm7, 416(%rdx)
511 ; SSE-NEXT: movaps %xmm6, 400(%rdx)
512 ; SSE-NEXT: movaps %xmm9, 384(%rdx)
513 ; SSE-NEXT: movaps %xmm8, 368(%rdx)
514 ; SSE-NEXT: movaps %xmm11, 352(%rdx)
515 ; SSE-NEXT: movaps %xmm10, 336(%rdx)
516 ; SSE-NEXT: movaps %xmm12, 320(%rdx)
517 ; SSE-NEXT: movaps %xmm13, 304(%rdx)
518 ; SSE-NEXT: movaps %xmm14, 288(%rdx)
519 ; SSE-NEXT: movaps %xmm15, 272(%rdx)
520 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
521 ; SSE-NEXT: movaps %xmm0, 256(%rdx)
522 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
523 ; SSE-NEXT: movaps %xmm0, 240(%rdx)
524 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
525 ; SSE-NEXT: movaps %xmm0, 224(%rdx)
526 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
527 ; SSE-NEXT: movaps %xmm0, 208(%rdx)
528 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
529 ; SSE-NEXT: movaps %xmm0, 192(%rdx)
530 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
531 ; SSE-NEXT: movaps %xmm0, 176(%rdx)
532 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
533 ; SSE-NEXT: movaps %xmm0, 160(%rdx)
534 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
535 ; SSE-NEXT: movaps %xmm0, 144(%rdx)
536 ; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
537 ; SSE-NEXT: movaps %xmm0, 128(%rdx)
538 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
539 ; SSE-NEXT: movaps %xmm0, 112(%rdx)
540 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
541 ; SSE-NEXT: movaps %xmm0, 96(%rdx)
542 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
543 ; SSE-NEXT: movaps %xmm0, 80(%rdx)
544 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
545 ; SSE-NEXT: movaps %xmm0, 64(%rdx)
546 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
547 ; SSE-NEXT: movaps %xmm0, 48(%rdx)
548 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
549 ; SSE-NEXT: movaps %xmm0, 32(%rdx)
550 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
551 ; SSE-NEXT: movaps %xmm0, 16(%rdx)
552 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
553 ; SSE-NEXT: movaps %xmm0, (%rdx)
554 ; SSE-NEXT: addq $152, %rsp
557 ; AVX1-ONLY-LABEL: store_i64_stride2_vf32:
558 ; AVX1-ONLY: # %bb.0:
559 ; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %xmm0
560 ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm1
561 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm1[1],xmm0[1]
562 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
563 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
564 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
565 ; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %xmm1
566 ; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm2
567 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm2[1],xmm1[1]
568 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
569 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
570 ; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm2
571 ; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm3
572 ; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm4
573 ; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm5
574 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm6
575 ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm7
576 ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm8
577 ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm9
578 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm6[1],xmm2[1]
579 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm6[0],xmm2[0]
580 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm2, %ymm2
581 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm7[1],xmm3[1]
582 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm7[0],xmm3[0]
583 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3
584 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm8[1],xmm4[1]
585 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm8[0],xmm4[0]
586 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4
587 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm9[1],xmm5[1]
588 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm9[0],xmm5[0]
589 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5
590 ; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %xmm6
591 ; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm7
592 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm7[1],xmm6[1]
593 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm7[0],xmm6[0]
594 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm6, %ymm6
595 ; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm7
596 ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm8
597 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm8[1],xmm7[1]
598 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm8[0],xmm7[0]
599 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm7, %ymm7
600 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = mem[2,3,2,3]
601 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = mem[2,3,2,3]
602 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[3],ymm8[3]
603 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = mem[2,3,2,3]
604 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = mem[2,3,2,3]
605 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm9 = ymm10[0],ymm9[0],ymm10[3],ymm9[3]
606 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = mem[2,3,2,3]
607 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = mem[2,3,2,3]
608 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm10 = ymm11[0],ymm10[0],ymm11[3],ymm10[3]
609 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = mem[2,3,2,3]
610 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = mem[2,3,2,3]
611 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm11 = ymm12[0],ymm11[0],ymm12[3],ymm11[3]
612 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = mem[2,3,2,3]
613 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = mem[2,3,2,3]
614 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm12 = ymm13[0],ymm12[0],ymm13[3],ymm12[3]
615 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = mem[2,3,2,3]
616 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = mem[2,3,2,3]
617 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm13 = ymm14[0],ymm13[0],ymm14[3],ymm13[3]
618 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = mem[2,3,2,3]
619 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = mem[2,3,2,3]
620 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm14 = ymm15[0],ymm14[0],ymm15[3],ymm14[3]
621 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = mem[2,3,2,3]
622 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3,2,3]
623 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm15[0],ymm0[3],ymm15[3]
624 ; AVX1-ONLY-NEXT: vmovapd %ymm0, 416(%rdx)
625 ; AVX1-ONLY-NEXT: vmovapd %ymm14, 352(%rdx)
626 ; AVX1-ONLY-NEXT: vmovapd %ymm13, 224(%rdx)
627 ; AVX1-ONLY-NEXT: vmovapd %ymm12, 32(%rdx)
628 ; AVX1-ONLY-NEXT: vmovapd %ymm11, 96(%rdx)
629 ; AVX1-ONLY-NEXT: vmovapd %ymm10, 160(%rdx)
630 ; AVX1-ONLY-NEXT: vmovapd %ymm9, 288(%rdx)
631 ; AVX1-ONLY-NEXT: vmovapd %ymm8, 480(%rdx)
632 ; AVX1-ONLY-NEXT: vmovaps %ymm7, 384(%rdx)
633 ; AVX1-ONLY-NEXT: vmovaps %ymm6, 320(%rdx)
634 ; AVX1-ONLY-NEXT: vmovaps %ymm5, 192(%rdx)
635 ; AVX1-ONLY-NEXT: vmovaps %ymm4, 128(%rdx)
636 ; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rdx)
637 ; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rdx)
638 ; AVX1-ONLY-NEXT: vmovaps %ymm1, 256(%rdx)
639 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
640 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 448(%rdx)
641 ; AVX1-ONLY-NEXT: vzeroupper
642 ; AVX1-ONLY-NEXT: retq
644 ; AVX2-ONLY-LABEL: store_i64_stride2_vf32:
645 ; AVX2-ONLY: # %bb.0:
646 ; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm2
647 ; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm5
648 ; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm9
649 ; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm1
650 ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm4
651 ; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm8
652 ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm12
653 ; AVX2-ONLY-NEXT: vmovaps 192(%rsi), %ymm6
654 ; AVX2-ONLY-NEXT: vmovaps 160(%rsi), %ymm10
655 ; AVX2-ONLY-NEXT: vmovaps 128(%rsi), %ymm13
656 ; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm3
657 ; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm7
658 ; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm11
659 ; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm14
660 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm3[0,2,2,3]
661 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm1[2,1,3,3]
662 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3],ymm15[4,5],ymm0[6,7]
663 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
664 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,0,2,1]
665 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,3]
666 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm3[2,3],ymm1[4,5],ymm3[6,7]
667 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
668 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm7[0,2,2,3]
669 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm4[2,1,3,3]
670 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1],ymm3[2,3],ymm15[4,5],ymm3[6,7]
671 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,0,2,1]
672 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,1,3]
673 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2,3],ymm4[4,5],ymm7[6,7]
674 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm11[0,2,2,3]
675 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm8[2,1,3,3]
676 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm15[0,1],ymm7[2,3],ymm15[4,5],ymm7[6,7]
677 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,0,2,1]
678 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,1,3]
679 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm11[2,3],ymm8[4,5],ymm11[6,7]
680 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm14[0,2,2,3]
681 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm12[2,1,3,3]
682 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1],ymm11[2,3],ymm15[4,5],ymm11[6,7]
683 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,0,2,1]
684 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[0,1,1,3]
685 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm14[2,3],ymm12[4,5],ymm14[6,7]
686 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm14 = ymm13[0,2,2,3]
687 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm9[2,1,3,3]
688 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3],ymm15[4,5],ymm14[6,7]
689 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,0,2,1]
690 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,1,3]
691 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm13[2,3],ymm9[4,5],ymm13[6,7]
692 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm13 = ymm10[0,2,2,3]
693 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm5[2,1,3,3]
694 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1],ymm13[2,3],ymm15[4,5],ymm13[6,7]
695 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,0,2,1]
696 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,1,3]
697 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm10[2,3],ymm5[4,5],ymm10[6,7]
698 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm6[0,2,2,3]
699 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm2[2,1,3,3]
700 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm15[0,1],ymm10[2,3],ymm15[4,5],ymm10[6,7]
701 ; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm15
702 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,0,2,1]
703 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,1,3]
704 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm6[2,3],ymm2[4,5],ymm6[6,7]
705 ; AVX2-ONLY-NEXT: vmovaps 224(%rsi), %ymm6
706 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm6[0,2,2,3]
707 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm15[2,1,3,3]
708 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
709 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm6[0,0,2,1]
710 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm15[0,1,1,3]
711 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3],ymm6[4,5],ymm1[6,7]
712 ; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%rdx)
713 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 480(%rdx)
714 ; AVX2-ONLY-NEXT: vmovaps %ymm2, 384(%rdx)
715 ; AVX2-ONLY-NEXT: vmovaps %ymm10, 416(%rdx)
716 ; AVX2-ONLY-NEXT: vmovaps %ymm5, 320(%rdx)
717 ; AVX2-ONLY-NEXT: vmovaps %ymm13, 352(%rdx)
718 ; AVX2-ONLY-NEXT: vmovaps %ymm9, 256(%rdx)
719 ; AVX2-ONLY-NEXT: vmovaps %ymm14, 288(%rdx)
720 ; AVX2-ONLY-NEXT: vmovaps %ymm12, 192(%rdx)
721 ; AVX2-ONLY-NEXT: vmovaps %ymm11, 224(%rdx)
722 ; AVX2-ONLY-NEXT: vmovaps %ymm8, 128(%rdx)
723 ; AVX2-ONLY-NEXT: vmovaps %ymm7, 160(%rdx)
724 ; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%rdx)
725 ; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%rdx)
726 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
727 ; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rdx)
728 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
729 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rdx)
730 ; AVX2-ONLY-NEXT: vzeroupper
731 ; AVX2-ONLY-NEXT: retq
733 ; AVX512-LABEL: store_i64_stride2_vf32:
735 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
736 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1
737 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm2
738 ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm3
739 ; AVX512-NEXT: vmovdqa64 (%rsi), %zmm4
740 ; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm5
741 ; AVX512-NEXT: vmovdqa64 128(%rsi), %zmm6
742 ; AVX512-NEXT: vmovdqa64 192(%rsi), %zmm7
743 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [4,12,5,13,6,14,7,15]
744 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm9
745 ; AVX512-NEXT: vpermt2q %zmm4, %zmm8, %zmm9
746 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,8,1,9,2,10,3,11]
747 ; AVX512-NEXT: vpermt2q %zmm4, %zmm10, %zmm0
748 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4
749 ; AVX512-NEXT: vpermt2q %zmm5, %zmm8, %zmm4
750 ; AVX512-NEXT: vpermt2q %zmm5, %zmm10, %zmm1
751 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm5
752 ; AVX512-NEXT: vpermt2q %zmm6, %zmm8, %zmm5
753 ; AVX512-NEXT: vpermt2q %zmm6, %zmm10, %zmm2
754 ; AVX512-NEXT: vpermi2q %zmm7, %zmm3, %zmm8
755 ; AVX512-NEXT: vpermt2q %zmm7, %zmm10, %zmm3
756 ; AVX512-NEXT: vmovdqa64 %zmm3, 384(%rdx)
757 ; AVX512-NEXT: vmovdqa64 %zmm8, 448(%rdx)
758 ; AVX512-NEXT: vmovdqa64 %zmm2, 256(%rdx)
759 ; AVX512-NEXT: vmovdqa64 %zmm5, 320(%rdx)
760 ; AVX512-NEXT: vmovdqa64 %zmm1, 128(%rdx)
761 ; AVX512-NEXT: vmovdqa64 %zmm4, 192(%rdx)
762 ; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx)
763 ; AVX512-NEXT: vmovdqa64 %zmm9, 64(%rdx)
764 ; AVX512-NEXT: vzeroupper
766 %in.vec0 = load <32 x i64>, ptr %in.vecptr0, align 64
767 %in.vec1 = load <32 x i64>, ptr %in.vecptr1, align 64
768 %1 = shufflevector <32 x i64> %in.vec0, <32 x i64> %in.vec1, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
769 %interleaved.vec = shufflevector <64 x i64> %1, <64 x i64> poison, <64 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
770 store <64 x i64> %interleaved.vec, ptr %out.vec, align 64
774 define void @store_i64_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.vec) nounwind {
775 ; SSE-LABEL: store_i64_stride2_vf64:
777 ; SSE-NEXT: subq $664, %rsp # imm = 0x298
778 ; SSE-NEXT: movaps 112(%rdi), %xmm14
779 ; SSE-NEXT: movaps 96(%rdi), %xmm13
780 ; SSE-NEXT: movaps 80(%rdi), %xmm11
781 ; SSE-NEXT: movaps 64(%rdi), %xmm10
782 ; SSE-NEXT: movaps (%rdi), %xmm7
783 ; SSE-NEXT: movaps 16(%rdi), %xmm8
784 ; SSE-NEXT: movaps 32(%rdi), %xmm9
785 ; SSE-NEXT: movaps 48(%rdi), %xmm12
786 ; SSE-NEXT: movaps 96(%rsi), %xmm0
787 ; SSE-NEXT: movaps 80(%rsi), %xmm1
788 ; SSE-NEXT: movaps 64(%rsi), %xmm2
789 ; SSE-NEXT: movaps (%rsi), %xmm3
790 ; SSE-NEXT: movaps 16(%rsi), %xmm4
791 ; SSE-NEXT: movaps 32(%rsi), %xmm5
792 ; SSE-NEXT: movaps 48(%rsi), %xmm6
793 ; SSE-NEXT: movaps %xmm7, %xmm15
794 ; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm3[0]
795 ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
796 ; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm3[1]
797 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
798 ; SSE-NEXT: movaps %xmm8, %xmm3
799 ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
800 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
801 ; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm4[1]
802 ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
803 ; SSE-NEXT: movaps %xmm9, %xmm3
804 ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm5[0]
805 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
806 ; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm5[1]
807 ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
808 ; SSE-NEXT: movaps %xmm12, %xmm3
809 ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm6[0]
810 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
811 ; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm6[1]
812 ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
813 ; SSE-NEXT: movaps %xmm10, %xmm3
814 ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0]
815 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
816 ; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm2[1]
817 ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
818 ; SSE-NEXT: movaps %xmm11, %xmm2
819 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
820 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
821 ; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm1[1]
822 ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
823 ; SSE-NEXT: movaps %xmm13, %xmm1
824 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
825 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
826 ; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1]
827 ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
828 ; SSE-NEXT: movaps 112(%rsi), %xmm0
829 ; SSE-NEXT: movaps %xmm14, %xmm1
830 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
831 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
832 ; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1]
833 ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
834 ; SSE-NEXT: movaps 128(%rdi), %xmm1
835 ; SSE-NEXT: movaps 128(%rsi), %xmm0
836 ; SSE-NEXT: movaps %xmm1, %xmm2
837 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
838 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
839 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
840 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
841 ; SSE-NEXT: movaps 144(%rdi), %xmm1
842 ; SSE-NEXT: movaps 144(%rsi), %xmm0
843 ; SSE-NEXT: movaps %xmm1, %xmm2
844 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
845 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
846 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
847 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
848 ; SSE-NEXT: movaps 160(%rdi), %xmm1
849 ; SSE-NEXT: movaps 160(%rsi), %xmm0
850 ; SSE-NEXT: movaps %xmm1, %xmm2
851 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
852 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
853 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
854 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
855 ; SSE-NEXT: movaps 176(%rdi), %xmm1
856 ; SSE-NEXT: movaps 176(%rsi), %xmm0
857 ; SSE-NEXT: movaps %xmm1, %xmm2
858 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
859 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
860 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
861 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
862 ; SSE-NEXT: movaps 192(%rdi), %xmm1
863 ; SSE-NEXT: movaps 192(%rsi), %xmm0
864 ; SSE-NEXT: movaps %xmm1, %xmm2
865 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
866 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
867 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
868 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
869 ; SSE-NEXT: movaps 208(%rdi), %xmm1
870 ; SSE-NEXT: movaps 208(%rsi), %xmm0
871 ; SSE-NEXT: movaps %xmm1, %xmm2
872 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
873 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
874 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
875 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
876 ; SSE-NEXT: movaps 224(%rdi), %xmm1
877 ; SSE-NEXT: movaps 224(%rsi), %xmm0
878 ; SSE-NEXT: movaps %xmm1, %xmm2
879 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
880 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
881 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
882 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
883 ; SSE-NEXT: movaps 240(%rdi), %xmm1
884 ; SSE-NEXT: movaps 240(%rsi), %xmm0
885 ; SSE-NEXT: movaps %xmm1, %xmm2
886 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
887 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
888 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
889 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
890 ; SSE-NEXT: movaps 256(%rdi), %xmm1
891 ; SSE-NEXT: movaps 256(%rsi), %xmm0
892 ; SSE-NEXT: movaps %xmm1, %xmm2
893 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
894 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
895 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
896 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
897 ; SSE-NEXT: movaps 272(%rdi), %xmm1
898 ; SSE-NEXT: movaps 272(%rsi), %xmm0
899 ; SSE-NEXT: movaps %xmm1, %xmm2
900 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
901 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
902 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
903 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
904 ; SSE-NEXT: movaps 288(%rdi), %xmm1
905 ; SSE-NEXT: movaps 288(%rsi), %xmm0
906 ; SSE-NEXT: movaps %xmm1, %xmm2
907 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
908 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
909 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
910 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
911 ; SSE-NEXT: movaps 304(%rdi), %xmm1
912 ; SSE-NEXT: movaps 304(%rsi), %xmm0
913 ; SSE-NEXT: movaps %xmm1, %xmm2
914 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
915 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
916 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
917 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
918 ; SSE-NEXT: movaps 320(%rdi), %xmm1
919 ; SSE-NEXT: movaps 320(%rsi), %xmm0
920 ; SSE-NEXT: movaps %xmm1, %xmm2
921 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
922 ; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill
923 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
924 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
925 ; SSE-NEXT: movaps 336(%rdi), %xmm1
926 ; SSE-NEXT: movaps 336(%rsi), %xmm0
927 ; SSE-NEXT: movaps %xmm1, %xmm2
928 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
929 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
930 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
931 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
932 ; SSE-NEXT: movaps 352(%rdi), %xmm1
933 ; SSE-NEXT: movaps 352(%rsi), %xmm0
934 ; SSE-NEXT: movaps %xmm1, %xmm2
935 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
936 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
937 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
938 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
939 ; SSE-NEXT: movaps 368(%rdi), %xmm15
940 ; SSE-NEXT: movaps 368(%rsi), %xmm0
941 ; SSE-NEXT: movaps %xmm15, %xmm1
942 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
943 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
944 ; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1]
945 ; SSE-NEXT: movaps 384(%rdi), %xmm13
946 ; SSE-NEXT: movaps 384(%rsi), %xmm0
947 ; SSE-NEXT: movaps %xmm13, %xmm1
948 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
949 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
950 ; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1]
951 ; SSE-NEXT: movaps 400(%rdi), %xmm11
952 ; SSE-NEXT: movaps 400(%rsi), %xmm0
953 ; SSE-NEXT: movaps %xmm11, %xmm1
954 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
955 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
956 ; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1]
957 ; SSE-NEXT: movaps 416(%rdi), %xmm12
958 ; SSE-NEXT: movaps 416(%rsi), %xmm0
959 ; SSE-NEXT: movaps %xmm12, %xmm14
960 ; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm0[0]
961 ; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1]
962 ; SSE-NEXT: movaps 432(%rdi), %xmm8
963 ; SSE-NEXT: movaps 432(%rsi), %xmm0
964 ; SSE-NEXT: movaps %xmm8, %xmm10
965 ; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0]
966 ; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1]
967 ; SSE-NEXT: movaps 448(%rdi), %xmm6
968 ; SSE-NEXT: movaps 448(%rsi), %xmm0
969 ; SSE-NEXT: movaps %xmm6, %xmm9
970 ; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm0[0]
971 ; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1]
972 ; SSE-NEXT: movaps 464(%rdi), %xmm5
973 ; SSE-NEXT: movaps 464(%rsi), %xmm1
974 ; SSE-NEXT: movaps %xmm5, %xmm7
975 ; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm1[0]
976 ; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1]
977 ; SSE-NEXT: movaps 480(%rdi), %xmm1
978 ; SSE-NEXT: movaps 480(%rsi), %xmm3
979 ; SSE-NEXT: movaps %xmm1, %xmm2
980 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
981 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
982 ; SSE-NEXT: movaps 496(%rdi), %xmm3
983 ; SSE-NEXT: movaps 496(%rsi), %xmm4
984 ; SSE-NEXT: movaps %xmm3, %xmm0
985 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0]
986 ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1]
987 ; SSE-NEXT: movaps %xmm3, 1008(%rdx)
988 ; SSE-NEXT: movaps %xmm0, 992(%rdx)
989 ; SSE-NEXT: movaps %xmm1, 976(%rdx)
990 ; SSE-NEXT: movaps %xmm2, 960(%rdx)
991 ; SSE-NEXT: movaps %xmm5, 944(%rdx)
992 ; SSE-NEXT: movaps %xmm7, 928(%rdx)
993 ; SSE-NEXT: movaps %xmm6, 912(%rdx)
994 ; SSE-NEXT: movaps %xmm9, 896(%rdx)
995 ; SSE-NEXT: movaps %xmm8, 880(%rdx)
996 ; SSE-NEXT: movaps %xmm10, 864(%rdx)
997 ; SSE-NEXT: movaps %xmm12, 848(%rdx)
998 ; SSE-NEXT: movaps %xmm14, 832(%rdx)
999 ; SSE-NEXT: movaps %xmm11, 816(%rdx)
1000 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1001 ; SSE-NEXT: movaps %xmm0, 800(%rdx)
1002 ; SSE-NEXT: movaps %xmm13, 784(%rdx)
1003 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1004 ; SSE-NEXT: movaps %xmm0, 768(%rdx)
1005 ; SSE-NEXT: movaps %xmm15, 752(%rdx)
1006 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1007 ; SSE-NEXT: movaps %xmm0, 736(%rdx)
1008 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1009 ; SSE-NEXT: movaps %xmm0, 720(%rdx)
1010 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1011 ; SSE-NEXT: movaps %xmm0, 704(%rdx)
1012 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1013 ; SSE-NEXT: movaps %xmm0, 688(%rdx)
1014 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1015 ; SSE-NEXT: movaps %xmm0, 672(%rdx)
1016 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1017 ; SSE-NEXT: movaps %xmm0, 656(%rdx)
1018 ; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
1019 ; SSE-NEXT: movaps %xmm0, 640(%rdx)
1020 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1021 ; SSE-NEXT: movaps %xmm0, 624(%rdx)
1022 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1023 ; SSE-NEXT: movaps %xmm0, 608(%rdx)
1024 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1025 ; SSE-NEXT: movaps %xmm0, 592(%rdx)
1026 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1027 ; SSE-NEXT: movaps %xmm0, 576(%rdx)
1028 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1029 ; SSE-NEXT: movaps %xmm0, 560(%rdx)
1030 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1031 ; SSE-NEXT: movaps %xmm0, 544(%rdx)
1032 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1033 ; SSE-NEXT: movaps %xmm0, 528(%rdx)
1034 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1035 ; SSE-NEXT: movaps %xmm0, 512(%rdx)
1036 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1037 ; SSE-NEXT: movaps %xmm0, 496(%rdx)
1038 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1039 ; SSE-NEXT: movaps %xmm0, 480(%rdx)
1040 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1041 ; SSE-NEXT: movaps %xmm0, 464(%rdx)
1042 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1043 ; SSE-NEXT: movaps %xmm0, 448(%rdx)
1044 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1045 ; SSE-NEXT: movaps %xmm0, 432(%rdx)
1046 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1047 ; SSE-NEXT: movaps %xmm0, 416(%rdx)
1048 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1049 ; SSE-NEXT: movaps %xmm0, 400(%rdx)
1050 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1051 ; SSE-NEXT: movaps %xmm0, 384(%rdx)
1052 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1053 ; SSE-NEXT: movaps %xmm0, 368(%rdx)
1054 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1055 ; SSE-NEXT: movaps %xmm0, 352(%rdx)
1056 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1057 ; SSE-NEXT: movaps %xmm0, 336(%rdx)
1058 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1059 ; SSE-NEXT: movaps %xmm0, 320(%rdx)
1060 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1061 ; SSE-NEXT: movaps %xmm0, 304(%rdx)
1062 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1063 ; SSE-NEXT: movaps %xmm0, 288(%rdx)
1064 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1065 ; SSE-NEXT: movaps %xmm0, 272(%rdx)
1066 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1067 ; SSE-NEXT: movaps %xmm0, 256(%rdx)
1068 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1069 ; SSE-NEXT: movaps %xmm0, 240(%rdx)
1070 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1071 ; SSE-NEXT: movaps %xmm0, 224(%rdx)
1072 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1073 ; SSE-NEXT: movaps %xmm0, 208(%rdx)
1074 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1075 ; SSE-NEXT: movaps %xmm0, 192(%rdx)
1076 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1077 ; SSE-NEXT: movaps %xmm0, 176(%rdx)
1078 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1079 ; SSE-NEXT: movaps %xmm0, 160(%rdx)
1080 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1081 ; SSE-NEXT: movaps %xmm0, 144(%rdx)
1082 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1083 ; SSE-NEXT: movaps %xmm0, 128(%rdx)
1084 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1085 ; SSE-NEXT: movaps %xmm0, 112(%rdx)
1086 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1087 ; SSE-NEXT: movaps %xmm0, 96(%rdx)
1088 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1089 ; SSE-NEXT: movaps %xmm0, 80(%rdx)
1090 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1091 ; SSE-NEXT: movaps %xmm0, 64(%rdx)
1092 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1093 ; SSE-NEXT: movaps %xmm0, 48(%rdx)
1094 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1095 ; SSE-NEXT: movaps %xmm0, 32(%rdx)
1096 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1097 ; SSE-NEXT: movaps %xmm0, 16(%rdx)
1098 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1099 ; SSE-NEXT: movaps %xmm0, (%rdx)
1100 ; SSE-NEXT: addq $664, %rsp # imm = 0x298
1103 ; AVX1-ONLY-LABEL: store_i64_stride2_vf64:
1104 ; AVX1-ONLY: # %bb.0:
1105 ; AVX1-ONLY-NEXT: subq $424, %rsp # imm = 0x1A8
1106 ; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm0
1107 ; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm1
1108 ; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm2
1109 ; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm3
1110 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm4
1111 ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm5
1112 ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm6
1113 ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm7
1114 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm4[1],xmm0[1]
1115 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm4[0],xmm0[0]
1116 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm0
1117 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1118 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm5[1],xmm1[1]
1119 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm1[0]
1120 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1121 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1122 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm6[1],xmm2[1]
1123 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm6[0],xmm2[0]
1124 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1125 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1126 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm7[1],xmm3[1]
1127 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm7[0],xmm3[0]
1128 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1129 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1130 ; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %xmm0
1131 ; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm1
1132 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm1[1],xmm0[1]
1133 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1134 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1135 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1136 ; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %xmm0
1137 ; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm1
1138 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm1[1],xmm0[1]
1139 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1140 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1141 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1142 ; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm0
1143 ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm1
1144 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm1[1],xmm0[1]
1145 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1146 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1147 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1148 ; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %xmm0
1149 ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm1
1150 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm1[1],xmm0[1]
1151 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1152 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1153 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1154 ; AVX1-ONLY-NEXT: vmovaps 256(%rsi), %xmm0
1155 ; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm1
1156 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm1[1],xmm0[1]
1157 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1158 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1159 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1160 ; AVX1-ONLY-NEXT: vmovaps 288(%rsi), %xmm0
1161 ; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm1
1162 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm1[1],xmm0[1]
1163 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1164 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1165 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1166 ; AVX1-ONLY-NEXT: vmovaps 320(%rsi), %xmm0
1167 ; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm1
1168 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm1[1],xmm0[1]
1169 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1170 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1171 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1172 ; AVX1-ONLY-NEXT: vmovaps 352(%rsi), %xmm0
1173 ; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm1
1174 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm1[1],xmm0[1]
1175 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1176 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1177 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1178 ; AVX1-ONLY-NEXT: vmovaps 384(%rsi), %xmm0
1179 ; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm1
1180 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm1[1],xmm0[1]
1181 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1182 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1183 ; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
1184 ; AVX1-ONLY-NEXT: vmovaps 416(%rsi), %xmm0
1185 ; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm1
1186 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm1[1],xmm0[1]
1187 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1188 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1189 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1190 ; AVX1-ONLY-NEXT: vmovaps 448(%rsi), %xmm0
1191 ; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm1
1192 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm1[1],xmm0[1]
1193 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1194 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1195 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1196 ; AVX1-ONLY-NEXT: vmovaps 480(%rsi), %xmm0
1197 ; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm1
1198 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm1[1],xmm0[1]
1199 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1200 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1201 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1202 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3,2,3]
1203 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3,2,3]
1204 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[3]
1205 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1206 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3,2,3]
1207 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3,2,3]
1208 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm1[0],ymm0[0],ymm1[3],ymm0[3]
1209 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3,2,3]
1210 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3,2,3]
1211 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm1[0],ymm0[0],ymm1[3],ymm0[3]
1212 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3,2,3]
1213 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3,2,3]
1214 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[3],ymm0[3]
1215 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3,2,3]
1216 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3,2,3]
1217 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[3],ymm0[3]
1218 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3,2,3]
1219 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3,2,3]
1220 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[3],ymm0[3]
1221 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3,2,3]
1222 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3,2,3]
1223 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[3],ymm0[3]
1224 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3,2,3]
1225 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = mem[2,3,2,3]
1226 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm8[0],ymm0[0],ymm8[3],ymm0[3]
1227 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = mem[2,3,2,3]
1228 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = mem[2,3,2,3]
1229 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[3],ymm8[3]
1230 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = mem[2,3,2,3]
1231 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = mem[2,3,2,3]
1232 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm9 = ymm10[0],ymm9[0],ymm10[3],ymm9[3]
1233 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = mem[2,3,2,3]
1234 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = mem[2,3,2,3]
1235 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm10 = ymm11[0],ymm10[0],ymm11[3],ymm10[3]
1236 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = mem[2,3,2,3]
1237 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = mem[2,3,2,3]
1238 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm11 = ymm12[0],ymm11[0],ymm12[3],ymm11[3]
1239 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = mem[2,3,2,3]
1240 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = mem[2,3,2,3]
1241 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm12 = ymm13[0],ymm12[0],ymm13[3],ymm12[3]
1242 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = mem[2,3,2,3]
1243 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = mem[2,3,2,3]
1244 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm13 = ymm14[0],ymm13[0],ymm14[3],ymm13[3]
1245 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = mem[2,3,2,3]
1246 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = mem[2,3,2,3]
1247 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm14 = ymm15[0],ymm14[0],ymm15[3],ymm14[3]
1248 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = mem[2,3,2,3]
1249 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = mem[2,3,2,3]
1250 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[0],ymm15[0],ymm7[3],ymm15[3]
1251 ; AVX1-ONLY-NEXT: vmovapd %ymm7, 992(%rdx)
1252 ; AVX1-ONLY-NEXT: vmovapd %ymm14, 928(%rdx)
1253 ; AVX1-ONLY-NEXT: vmovapd %ymm13, 864(%rdx)
1254 ; AVX1-ONLY-NEXT: vmovapd %ymm12, 800(%rdx)
1255 ; AVX1-ONLY-NEXT: vmovapd %ymm11, 736(%rdx)
1256 ; AVX1-ONLY-NEXT: vmovapd %ymm10, 672(%rdx)
1257 ; AVX1-ONLY-NEXT: vmovapd %ymm9, 608(%rdx)
1258 ; AVX1-ONLY-NEXT: vmovapd %ymm8, 544(%rdx)
1259 ; AVX1-ONLY-NEXT: vmovapd %ymm0, 480(%rdx)
1260 ; AVX1-ONLY-NEXT: vmovapd %ymm1, 416(%rdx)
1261 ; AVX1-ONLY-NEXT: vmovapd %ymm2, 352(%rdx)
1262 ; AVX1-ONLY-NEXT: vmovapd %ymm3, 288(%rdx)
1263 ; AVX1-ONLY-NEXT: vmovapd %ymm4, 224(%rdx)
1264 ; AVX1-ONLY-NEXT: vmovapd %ymm5, 160(%rdx)
1265 ; AVX1-ONLY-NEXT: vmovapd %ymm6, 96(%rdx)
1266 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1267 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rdx)
1268 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1269 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 960(%rdx)
1270 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1271 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 896(%rdx)
1272 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1273 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 832(%rdx)
1274 ; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
1275 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 768(%rdx)
1276 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1277 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 704(%rdx)
1278 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1279 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 640(%rdx)
1280 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1281 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 576(%rdx)
1282 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1283 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 512(%rdx)
1284 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1285 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 448(%rdx)
1286 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1287 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 384(%rdx)
1288 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1289 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 320(%rdx)
1290 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1291 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%rdx)
1292 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1293 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rdx)
1294 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1295 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rdx)
1296 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1297 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rdx)
1298 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1299 ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rdx)
1300 ; AVX1-ONLY-NEXT: addq $424, %rsp # imm = 0x1A8
1301 ; AVX1-ONLY-NEXT: vzeroupper
1302 ; AVX1-ONLY-NEXT: retq
1304 ; AVX2-ONLY-LABEL: store_i64_stride2_vf64:
1305 ; AVX2-ONLY: # %bb.0:
1306 ; AVX2-ONLY-NEXT: subq $456, %rsp # imm = 0x1C8
1307 ; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm0
1308 ; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm1
1309 ; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm2
1310 ; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm7
1311 ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm8
1312 ; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm6
1313 ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm4
1314 ; AVX2-ONLY-NEXT: vmovaps 192(%rsi), %ymm3
1315 ; AVX2-ONLY-NEXT: vmovaps 160(%rsi), %ymm5
1316 ; AVX2-ONLY-NEXT: vmovaps 128(%rsi), %ymm9
1317 ; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm10
1318 ; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm11
1319 ; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm12
1320 ; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm13
1321 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm14 = ymm10[0,0,2,1]
1322 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm7[0,1,1,3]
1323 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3],ymm15[4,5],ymm14[6,7]
1324 ; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1325 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,2,2,3]
1326 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,1,3,3]
1327 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm10[2,3],ymm7[4,5],ymm10[6,7]
1328 ; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1329 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm11[0,0,2,1]
1330 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm8[0,1,1,3]
1331 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1],ymm7[2,3],ymm10[4,5],ymm7[6,7]
1332 ; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1333 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm11[0,2,2,3]
1334 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,1,3,3]
1335 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3],ymm8[4,5],ymm7[6,7]
1336 ; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1337 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm12[0,0,2,1]
1338 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm6[0,1,1,3]
1339 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3],ymm8[4,5],ymm7[6,7]
1340 ; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1341 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm12[0,2,2,3]
1342 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,1,3,3]
1343 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3],ymm6[4,5],ymm7[6,7]
1344 ; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1345 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm13[0,0,2,1]
1346 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm4[0,1,1,3]
1347 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7]
1348 ; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1349 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm13[0,2,2,3]
1350 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,1,3,3]
1351 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm6[2,3],ymm4[4,5],ymm6[6,7]
1352 ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1353 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm9[0,0,2,1]
1354 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm2[0,1,1,3]
1355 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7]
1356 ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1357 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm9[0,2,2,3]
1358 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,3,3]
1359 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3],ymm2[4,5],ymm4[6,7]
1360 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1361 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm5[0,0,2,1]
1362 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm1[0,1,1,3]
1363 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7]
1364 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1365 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm5[0,2,2,3]
1366 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,3,3]
1367 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7]
1368 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1369 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm3[0,0,2,1]
1370 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[0,1,1,3]
1371 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7]
1372 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1373 ; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm1
1374 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm3[0,2,2,3]
1375 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3]
1376 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
1377 ; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
1378 ; AVX2-ONLY-NEXT: vmovaps 224(%rsi), %ymm0
1379 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[0,0,2,1]
1380 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm1[0,1,1,3]
1381 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
1382 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1383 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
1384 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,3,3]
1385 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
1386 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1387 ; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm0
1388 ; AVX2-ONLY-NEXT: vmovaps 256(%rsi), %ymm1
1389 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm1[0,0,2,1]
1390 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[0,1,1,3]
1391 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
1392 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1393 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
1394 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3]
1395 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
1396 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1397 ; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm0
1398 ; AVX2-ONLY-NEXT: vmovaps 288(%rsi), %ymm1
1399 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm1[0,0,2,1]
1400 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[0,1,1,3]
1401 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
1402 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
1403 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3]
1404 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
1405 ; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm0
1406 ; AVX2-ONLY-NEXT: vmovaps 320(%rsi), %ymm1
1407 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm1[0,0,2,1]
1408 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[0,1,1,3]
1409 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
1410 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
1411 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3]
1412 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
1413 ; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm0
1414 ; AVX2-ONLY-NEXT: vmovaps 352(%rsi), %ymm1
1415 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm1[0,0,2,1]
1416 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm0[0,1,1,3]
1417 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1],ymm2[2,3],ymm9[4,5],ymm2[6,7]
1418 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
1419 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3]
1420 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
1421 ; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm9
1422 ; AVX2-ONLY-NEXT: vmovaps 384(%rsi), %ymm0
1423 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,0,2,1]
1424 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm9[0,1,1,3]
1425 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1],ymm1[2,3],ymm11[4,5],ymm1[6,7]
1426 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
1427 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,1,3,3]
1428 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm9[0,1],ymm0[2,3],ymm9[4,5],ymm0[6,7]
1429 ; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm9
1430 ; AVX2-ONLY-NEXT: vmovaps 416(%rsi), %ymm11
1431 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm11[0,0,2,1]
1432 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm13 = ymm9[0,1,1,3]
1433 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm13[0,1],ymm0[2,3],ymm13[4,5],ymm0[6,7]
1434 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,2,2,3]
1435 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,1,3,3]
1436 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm11[2,3],ymm9[4,5],ymm11[6,7]
1437 ; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm11
1438 ; AVX2-ONLY-NEXT: vmovaps 448(%rsi), %ymm13
1439 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm13[0,0,2,1]
1440 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm11[0,1,1,3]
1441 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1],ymm0[2,3],ymm15[4,5],ymm0[6,7]
1442 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,2,2,3]
1443 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,1,3,3]
1444 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm13[2,3],ymm11[4,5],ymm13[6,7]
1445 ; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm13
1446 ; AVX2-ONLY-NEXT: vmovaps 480(%rsi), %ymm15
1447 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm15[0,0,2,1]
1448 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm14 = ymm13[0,1,1,3]
1449 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm0[2,3],ymm14[4,5],ymm0[6,7]
1450 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm14 = ymm15[0,2,2,3]
1451 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,1,3,3]
1452 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm14[2,3],ymm13[4,5],ymm14[6,7]
1453 ; AVX2-ONLY-NEXT: vmovaps %ymm13, 992(%rdx)
1454 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 960(%rdx)
1455 ; AVX2-ONLY-NEXT: vmovaps %ymm11, 928(%rdx)
1456 ; AVX2-ONLY-NEXT: vmovaps %ymm1, 896(%rdx)
1457 ; AVX2-ONLY-NEXT: vmovaps %ymm9, 864(%rdx)
1458 ; AVX2-ONLY-NEXT: vmovaps %ymm2, 832(%rdx)
1459 ; AVX2-ONLY-NEXT: vmovaps %ymm3, 800(%rdx)
1460 ; AVX2-ONLY-NEXT: vmovaps %ymm4, 768(%rdx)
1461 ; AVX2-ONLY-NEXT: vmovaps %ymm5, 736(%rdx)
1462 ; AVX2-ONLY-NEXT: vmovaps %ymm6, 704(%rdx)
1463 ; AVX2-ONLY-NEXT: vmovaps %ymm7, 672(%rdx)
1464 ; AVX2-ONLY-NEXT: vmovaps %ymm8, 640(%rdx)
1465 ; AVX2-ONLY-NEXT: vmovaps %ymm10, 608(%rdx)
1466 ; AVX2-ONLY-NEXT: vmovaps %ymm12, 576(%rdx)
1467 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1468 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 544(%rdx)
1469 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1470 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 512(%rdx)
1471 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1472 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 480(%rdx)
1473 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1474 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 448(%rdx)
1475 ; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
1476 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 416(%rdx)
1477 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1478 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 384(%rdx)
1479 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1480 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 352(%rdx)
1481 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1482 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 320(%rdx)
1483 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1484 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 288(%rdx)
1485 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1486 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 256(%rdx)
1487 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1488 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%rdx)
1489 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1490 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%rdx)
1491 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1492 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%rdx)
1493 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1494 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%rdx)
1495 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1496 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%rdx)
1497 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1498 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%rdx)
1499 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1500 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rdx)
1501 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1502 ; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rdx)
1503 ; AVX2-ONLY-NEXT: addq $456, %rsp # imm = 0x1C8
1504 ; AVX2-ONLY-NEXT: vzeroupper
1505 ; AVX2-ONLY-NEXT: retq
1507 ; AVX512-LABEL: store_i64_stride2_vf64:
1509 ; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm0
1510 ; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm1
1511 ; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm2
1512 ; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm3
1513 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm4
1514 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm5
1515 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm6
1516 ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm7
1517 ; AVX512-NEXT: vmovdqa64 448(%rsi), %zmm8
1518 ; AVX512-NEXT: vmovdqa64 384(%rsi), %zmm9
1519 ; AVX512-NEXT: vmovdqa64 320(%rsi), %zmm10
1520 ; AVX512-NEXT: vmovdqa64 256(%rsi), %zmm11
1521 ; AVX512-NEXT: vmovdqa64 (%rsi), %zmm12
1522 ; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm13
1523 ; AVX512-NEXT: vmovdqa64 128(%rsi), %zmm14
1524 ; AVX512-NEXT: vmovdqa64 192(%rsi), %zmm15
1525 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm16 = [4,12,5,13,6,14,7,15]
1526 ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm17
1527 ; AVX512-NEXT: vpermt2q %zmm12, %zmm16, %zmm17
1528 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,8,1,9,2,10,3,11]
1529 ; AVX512-NEXT: vpermt2q %zmm12, %zmm18, %zmm4
1530 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm12
1531 ; AVX512-NEXT: vpermt2q %zmm13, %zmm16, %zmm12
1532 ; AVX512-NEXT: vpermt2q %zmm13, %zmm18, %zmm5
1533 ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm13
1534 ; AVX512-NEXT: vpermt2q %zmm14, %zmm16, %zmm13
1535 ; AVX512-NEXT: vpermt2q %zmm14, %zmm18, %zmm6
1536 ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm14
1537 ; AVX512-NEXT: vpermt2q %zmm15, %zmm16, %zmm14
1538 ; AVX512-NEXT: vpermt2q %zmm15, %zmm18, %zmm7
1539 ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm15
1540 ; AVX512-NEXT: vpermt2q %zmm11, %zmm16, %zmm15
1541 ; AVX512-NEXT: vpermt2q %zmm11, %zmm18, %zmm3
1542 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm11
1543 ; AVX512-NEXT: vpermt2q %zmm10, %zmm16, %zmm11
1544 ; AVX512-NEXT: vpermt2q %zmm10, %zmm18, %zmm2
1545 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm10
1546 ; AVX512-NEXT: vpermt2q %zmm9, %zmm16, %zmm10
1547 ; AVX512-NEXT: vpermt2q %zmm9, %zmm18, %zmm1
1548 ; AVX512-NEXT: vpermi2q %zmm8, %zmm0, %zmm16
1549 ; AVX512-NEXT: vpermt2q %zmm8, %zmm18, %zmm0
1550 ; AVX512-NEXT: vmovdqa64 %zmm0, 896(%rdx)
1551 ; AVX512-NEXT: vmovdqa64 %zmm16, 960(%rdx)
1552 ; AVX512-NEXT: vmovdqa64 %zmm1, 768(%rdx)
1553 ; AVX512-NEXT: vmovdqa64 %zmm10, 832(%rdx)
1554 ; AVX512-NEXT: vmovdqa64 %zmm2, 640(%rdx)
1555 ; AVX512-NEXT: vmovdqa64 %zmm11, 704(%rdx)
1556 ; AVX512-NEXT: vmovdqa64 %zmm3, 512(%rdx)
1557 ; AVX512-NEXT: vmovdqa64 %zmm15, 576(%rdx)
1558 ; AVX512-NEXT: vmovdqa64 %zmm7, 384(%rdx)
1559 ; AVX512-NEXT: vmovdqa64 %zmm14, 448(%rdx)
1560 ; AVX512-NEXT: vmovdqa64 %zmm6, 256(%rdx)
1561 ; AVX512-NEXT: vmovdqa64 %zmm13, 320(%rdx)
1562 ; AVX512-NEXT: vmovdqa64 %zmm5, 128(%rdx)
1563 ; AVX512-NEXT: vmovdqa64 %zmm12, 192(%rdx)
1564 ; AVX512-NEXT: vmovdqa64 %zmm4, (%rdx)
1565 ; AVX512-NEXT: vmovdqa64 %zmm17, 64(%rdx)
1566 ; AVX512-NEXT: vzeroupper
1568 %in.vec0 = load <64 x i64>, ptr %in.vecptr0, align 64
1569 %in.vec1 = load <64 x i64>, ptr %in.vecptr1, align 64
1570 %1 = shufflevector <64 x i64> %in.vec0, <64 x i64> %in.vec1, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
1571 %interleaved.vec = shufflevector <128 x i64> %1, <128 x i64> poison, <128 x i32> <i32 0, i32 64, i32 1, i32 65, i32 2, i32 66, i32 3, i32 67, i32 4, i32 68, i32 5, i32 69, i32 6, i32 70, i32 7, i32 71, i32 8, i32 72, i32 9, i32 73, i32 10, i32 74, i32 11, i32 75, i32 12, i32 76, i32 13, i32 77, i32 14, i32 78, i32 15, i32 79, i32 16, i32 80, i32 17, i32 81, i32 18, i32 82, i32 19, i32 83, i32 20, i32 84, i32 21, i32 85, i32 22, i32 86, i32 23, i32 87, i32 24, i32 88, i32 25, i32 89, i32 26, i32 90, i32 27, i32 91, i32 28, i32 92, i32 29, i32 93, i32 30, i32 94, i32 31, i32 95, i32 32, i32 96, i32 33, i32 97, i32 34, i32 98, i32 35, i32 99, i32 36, i32 100, i32 37, i32 101, i32 38, i32 102, i32 39, i32 103, i32 40, i32 104, i32 41, i32 105, i32 42, i32 106, i32 43, i32 107, i32 44, i32 108, i32 45, i32 109, i32 46, i32 110, i32 47, i32 111, i32 48, i32 112, i32 49, i32 113, i32 50, i32 114, i32 51, i32 115, i32 52, i32 116, i32 53, i32 117, i32 54, i32 118, i32 55, i32 119, i32 56, i32 120, i32 57, i32 121, i32 58, i32 122, i32 59, i32 123, i32 60, i32 124, i32 61, i32 125, i32 62, i32 126, i32 63, i32 127>
1572 store <128 x i64> %interleaved.vec, ptr %out.vec, align 64
1575 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
1579 ; AVX2-FAST-PERLANE: {{.*}}
1581 ; AVX512-FAST: {{.*}}
1582 ; AVX512-SLOW: {{.*}}
1584 ; AVX512BW-FAST: {{.*}}
1585 ; AVX512BW-ONLY-FAST: {{.*}}
1586 ; AVX512BW-ONLY-SLOW: {{.*}}
1587 ; AVX512BW-SLOW: {{.*}}
1588 ; AVX512DQ-FAST: {{.*}}
1589 ; AVX512DQ-SLOW: {{.*}}
1590 ; AVX512DQBW-FAST: {{.*}}
1591 ; AVX512DQBW-SLOW: {{.*}}
1593 ; AVX512F-FAST: {{.*}}
1594 ; AVX512F-ONLY-FAST: {{.*}}
1595 ; AVX512F-ONLY-SLOW: {{.*}}
1596 ; AVX512F-SLOW: {{.*}}
1599 ; FALLBACK10: {{.*}}
1600 ; FALLBACK11: {{.*}}
1601 ; FALLBACK12: {{.*}}