1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,FALLBACK0
3 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-ONLY,FALLBACK1
4 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX1,AVX2,AVX2-ONLY,AVX2-SLOW,FALLBACK2
5 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX1,AVX2,AVX2-ONLY,AVX2-FAST,FALLBACK3
6 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX1,AVX2,AVX2-ONLY,AVX2-FAST-PERLANE,FALLBACK4
7 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512-SLOW,AVX512F-SLOW,AVX512F-ONLY-SLOW,FALLBACK5
8 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512-FAST,AVX512F-FAST,AVX512F-ONLY-FAST,FALLBACK6
9 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512-SLOW,AVX512F-SLOW,AVX512DQ-SLOW,FALLBACK7
10 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512-FAST,AVX512F-FAST,AVX512DQ-FAST,FALLBACK8
11 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512-SLOW,AVX512BW-SLOW,AVX512BW-ONLY-SLOW,FALLBACK9
12 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512-FAST,AVX512BW-FAST,AVX512BW-ONLY-FAST,FALLBACK10
13 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512-SLOW,AVX512BW-SLOW,AVX512DQBW-SLOW,FALLBACK11
14 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512-FAST,AVX512BW-FAST,AVX512DQBW-FAST,FALLBACK12
16 ; These patterns are produced by LoopVectorizer for interleaved stores.
; Stride-2 interleaved store of two <2 x i32> inputs.
; Both inputs are loaded with align 64, concatenated into one <4 x i32>, and
; reordered with mask <0,2,1,3> so that elements of the first and second input
; alternate; the result is written to %out.vec with align 64.
; The checks under the SSE and AVX prefixes both expect two 8-byte loads
; (movsd / vmovsd), one unpcklps-style interleave, and a single aligned store.
; ("vf2" presumably stands for a vectorization factor of 2 -- TODO confirm.)
18 define void @store_i32_stride2_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.vec) nounwind {
19 ; SSE-LABEL: store_i32_stride2_vf2:
21 ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
22 ; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
23 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
24 ; SSE-NEXT: movaps %xmm1, (%rdx)
27 ; AVX-LABEL: store_i32_stride2_vf2:
29 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
30 ; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
31 ; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
32 ; AVX-NEXT: vmovaps %xmm0, (%rdx)
; Load the two source vectors.
34 %in.vec0 = load <2 x i32>, ptr %in.vecptr0, align 64
35 %in.vec1 = load <2 x i32>, ptr %in.vecptr1, align 64
; Identity mask over both operands, i.e. a plain concatenation.
36 %1 = shufflevector <2 x i32> %in.vec0, <2 x i32> %in.vec1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; Interleave mask, alternating first-input and second-input elements.
37 %interleaved.vec = shufflevector <4 x i32> %1, <4 x i32> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
38 store <4 x i32> %interleaved.vec, ptr %out.vec, align 64
; Stride-2 interleaved store of two <4 x i32> inputs.
; Both inputs are loaded with align 64, concatenated into one <8 x i32>, and
; reordered with mask <0,4,1,5,2,6,3,7>; the result is written to %out.vec
; with align 64.
; Expected lowering, as recorded by the checks below.
;  - SSE prefix       one unpcklps plus one unpckhps, then two 16-byte stores.
;  - AVX1-ONLY prefix two vinsertf128, two vshufps and a vblendps, then one
;                     32-byte store.
;  - AVX2 prefix      a vinsertf128 with a memory operand followed by vpermps
;                     using the index vector [0,4,1,5,2,6,3,7].
; ("vf4" presumably stands for a vectorization factor of 4 -- TODO confirm.)
42 define void @store_i32_stride2_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.vec) nounwind {
43 ; SSE-LABEL: store_i32_stride2_vf4:
45 ; SSE-NEXT: movaps (%rdi), %xmm0
46 ; SSE-NEXT: movaps (%rsi), %xmm1
47 ; SSE-NEXT: movaps %xmm0, %xmm2
48 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
49 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
50 ; SSE-NEXT: movaps %xmm0, 16(%rdx)
51 ; SSE-NEXT: movaps %xmm2, (%rdx)
54 ; AVX1-ONLY-LABEL: store_i32_stride2_vf4:
56 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm0
57 ; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm1
58 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2
59 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
60 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,3,1,6,4,7,5]
61 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2,1,3,4,6,5,7]
62 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6],ymm1[7]
63 ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rdx)
64 ; AVX1-ONLY-NEXT: vzeroupper
65 ; AVX1-ONLY-NEXT: retq
67 ; AVX2-LABEL: store_i32_stride2_vf4:
69 ; AVX2-NEXT: vmovaps (%rdi), %xmm0
70 ; AVX2-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0
71 ; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7]
72 ; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
73 ; AVX2-NEXT: vmovaps %ymm0, (%rdx)
74 ; AVX2-NEXT: vzeroupper
; Load the two source vectors.
76 %in.vec0 = load <4 x i32>, ptr %in.vecptr0, align 64
77 %in.vec1 = load <4 x i32>, ptr %in.vecptr1, align 64
; Identity mask over both operands, i.e. a plain concatenation.
78 %1 = shufflevector <4 x i32> %in.vec0, <4 x i32> %in.vec1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; Interleave mask, alternating first-input and second-input elements.
79 %interleaved.vec = shufflevector <8 x i32> %1, <8 x i32> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
80 store <8 x i32> %interleaved.vec, ptr %out.vec, align 64
; Stride-2 interleaved store of two <8 x i32> inputs.
; Both inputs are loaded with align 64, concatenated into one <16 x i32>, and
; reordered with mask <0,8,1,9,...,7,15>; the result is written to %out.vec
; with align 64.
; Expected lowering, as recorded by the checks below.
;  - SSE prefix       an unpcklps/unpckhps pair per 16-byte input chunk and
;                     four 16-byte stores.
;  - AVX1-ONLY prefix 128-bit vunpcklps/vunpckhps pairs merged with
;                     vinsertf128, then two 32-byte stores.
;  - AVX2-ONLY prefix 256-bit vunpcklps/vunpckhps followed by two vperm2f128
;                     to put the 128-bit halves in output order.
;  - AVX512 prefix    vinsertf64x4 with a memory operand followed by a single
;                     512-bit vpermps using the interleave index vector.
; ("vf8" presumably stands for a vectorization factor of 8 -- TODO confirm.)
84 define void @store_i32_stride2_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.vec) nounwind {
85 ; SSE-LABEL: store_i32_stride2_vf8:
87 ; SSE-NEXT: movaps (%rdi), %xmm0
88 ; SSE-NEXT: movaps 16(%rdi), %xmm1
89 ; SSE-NEXT: movaps (%rsi), %xmm2
90 ; SSE-NEXT: movaps 16(%rsi), %xmm3
91 ; SSE-NEXT: movaps %xmm0, %xmm4
92 ; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm2[2],xmm4[3],xmm2[3]
93 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
94 ; SSE-NEXT: movaps %xmm1, %xmm2
95 ; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
96 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
97 ; SSE-NEXT: movaps %xmm1, 32(%rdx)
98 ; SSE-NEXT: movaps %xmm2, 48(%rdx)
99 ; SSE-NEXT: movaps %xmm0, (%rdx)
100 ; SSE-NEXT: movaps %xmm4, 16(%rdx)
103 ; AVX1-ONLY-LABEL: store_i32_stride2_vf8:
104 ; AVX1-ONLY: # %bb.0:
105 ; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm0
106 ; AVX1-ONLY-NEXT: vmovaps 16(%rsi), %xmm1
107 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm2
108 ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm3
109 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
110 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
111 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
112 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm3[2],xmm1[2],xmm3[3],xmm1[3]
113 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
114 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
115 ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rdx)
116 ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rdx)
117 ; AVX1-ONLY-NEXT: vzeroupper
118 ; AVX1-ONLY-NEXT: retq
120 ; AVX2-ONLY-LABEL: store_i32_stride2_vf8:
121 ; AVX2-ONLY: # %bb.0:
122 ; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm0
123 ; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm1
124 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
125 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
126 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[0,1],ymm2[0,1]
127 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
128 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rdx)
129 ; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rdx)
130 ; AVX2-ONLY-NEXT: vzeroupper
131 ; AVX2-ONLY-NEXT: retq
133 ; AVX512-LABEL: store_i32_stride2_vf8:
135 ; AVX512-NEXT: vmovaps (%rdi), %ymm0
136 ; AVX512-NEXT: vinsertf64x4 $1, (%rsi), %zmm0, %zmm0
137 ; AVX512-NEXT: vmovaps {{.*#+}} zmm1 = [0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15]
138 ; AVX512-NEXT: vpermps %zmm0, %zmm1, %zmm0
139 ; AVX512-NEXT: vmovaps %zmm0, (%rdx)
140 ; AVX512-NEXT: vzeroupper
; Load the two source vectors.
142 %in.vec0 = load <8 x i32>, ptr %in.vecptr0, align 64
143 %in.vec1 = load <8 x i32>, ptr %in.vecptr1, align 64
; Identity mask over both operands, i.e. a plain concatenation.
144 %1 = shufflevector <8 x i32> %in.vec0, <8 x i32> %in.vec1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; Interleave mask, alternating first-input and second-input elements.
145 %interleaved.vec = shufflevector <16 x i32> %1, <16 x i32> poison, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
146 store <16 x i32> %interleaved.vec, ptr %out.vec, align 64
; Stride-2 interleaved store of two <16 x i32> inputs.
; Both inputs are loaded with align 64, concatenated into one <32 x i32>, and
; reordered with mask <0,16,1,17,...,15,31>; the result is written to %out.vec
; with align 64.
; Expected lowering, as recorded by the checks below.
;  - SSE prefix       an unpcklps/unpckhps pair per 16-byte input chunk and
;                     eight 16-byte stores.
;  - AVX1-ONLY prefix 128-bit vunpcklps/vunpckhps pairs merged with
;                     vinsertf128, then four 32-byte stores.
;  - AVX2-ONLY prefix 256-bit vunpcklps/vunpckhps followed by vperm2f128
;                     pairs, then four 32-byte stores.
;  - AVX512 prefix    two vpermi2d, one with the index table for output
;                     elements 0..15 and one for 16..31, then two 64-byte
;                     stores.
; ("vf16" presumably stands for a vectorization factor of 16 -- TODO confirm.)
150 define void @store_i32_stride2_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.vec) nounwind {
151 ; SSE-LABEL: store_i32_stride2_vf16:
153 ; SSE-NEXT: movaps (%rdi), %xmm0
154 ; SSE-NEXT: movaps 16(%rdi), %xmm1
155 ; SSE-NEXT: movaps 32(%rdi), %xmm2
156 ; SSE-NEXT: movaps 48(%rdi), %xmm3
157 ; SSE-NEXT: movaps (%rsi), %xmm4
158 ; SSE-NEXT: movaps 16(%rsi), %xmm5
159 ; SSE-NEXT: movaps 32(%rsi), %xmm6
160 ; SSE-NEXT: movaps 48(%rsi), %xmm7
161 ; SSE-NEXT: movaps %xmm0, %xmm8
162 ; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm4[2],xmm8[3],xmm4[3]
163 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
164 ; SSE-NEXT: movaps %xmm1, %xmm4
165 ; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm5[2],xmm4[3],xmm5[3]
166 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
167 ; SSE-NEXT: movaps %xmm2, %xmm5
168 ; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm6[2],xmm5[3],xmm6[3]
169 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1]
170 ; SSE-NEXT: movaps %xmm3, %xmm6
171 ; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm7[2],xmm6[3],xmm7[3]
172 ; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1]
173 ; SSE-NEXT: movaps %xmm3, 96(%rdx)
174 ; SSE-NEXT: movaps %xmm6, 112(%rdx)
175 ; SSE-NEXT: movaps %xmm2, 64(%rdx)
176 ; SSE-NEXT: movaps %xmm5, 80(%rdx)
177 ; SSE-NEXT: movaps %xmm1, 32(%rdx)
178 ; SSE-NEXT: movaps %xmm4, 48(%rdx)
179 ; SSE-NEXT: movaps %xmm0, (%rdx)
180 ; SSE-NEXT: movaps %xmm8, 16(%rdx)
183 ; AVX1-ONLY-LABEL: store_i32_stride2_vf16:
184 ; AVX1-ONLY: # %bb.0:
185 ; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm0
186 ; AVX1-ONLY-NEXT: vmovaps 16(%rsi), %xmm1
187 ; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm2
188 ; AVX1-ONLY-NEXT: vmovaps 48(%rsi), %xmm3
189 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm4
190 ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm5
191 ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm6
192 ; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm7
193 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm8 = xmm5[2],xmm1[2],xmm5[3],xmm1[3]
194 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
195 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm1, %ymm1
196 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm4[2],xmm0[2],xmm4[3],xmm0[3]
197 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
198 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0
199 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm6[2],xmm2[2],xmm6[3],xmm2[3]
200 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1]
201 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
202 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm7[2],xmm3[2],xmm7[3],xmm3[3]
203 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1]
204 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
205 ; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rdx)
206 ; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rdx)
207 ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rdx)
208 ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rdx)
209 ; AVX1-ONLY-NEXT: vzeroupper
210 ; AVX1-ONLY-NEXT: retq
212 ; AVX2-ONLY-LABEL: store_i32_stride2_vf16:
213 ; AVX2-ONLY: # %bb.0:
214 ; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm0
215 ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm1
216 ; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm2
217 ; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm3
218 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[6],ymm2[6],ymm0[7],ymm2[7]
219 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[4],ymm2[4],ymm0[5],ymm2[5]
220 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm4[2,3]
221 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm4[0,1]
222 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7]
223 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5]
224 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3],ymm4[2,3]
225 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[0,1],ymm4[0,1]
226 ; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rdx)
227 ; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%rdx)
228 ; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rdx)
229 ; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rdx)
230 ; AVX2-ONLY-NEXT: vzeroupper
231 ; AVX2-ONLY-NEXT: retq
233 ; AVX512-LABEL: store_i32_stride2_vf16:
235 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
236 ; AVX512-NEXT: vmovdqa64 (%rsi), %zmm1
237 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23]
238 ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
239 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31]
240 ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
241 ; AVX512-NEXT: vmovdqa64 %zmm3, 64(%rdx)
242 ; AVX512-NEXT: vmovdqa64 %zmm2, (%rdx)
243 ; AVX512-NEXT: vzeroupper
; Load the two source vectors.
245 %in.vec0 = load <16 x i32>, ptr %in.vecptr0, align 64
246 %in.vec1 = load <16 x i32>, ptr %in.vecptr1, align 64
; Identity mask over both operands, i.e. a plain concatenation.
247 %1 = shufflevector <16 x i32> %in.vec0, <16 x i32> %in.vec1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
; Interleave mask, alternating first-input and second-input elements.
248 %interleaved.vec = shufflevector <32 x i32> %1, <32 x i32> poison, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
249 store <32 x i32> %interleaved.vec, ptr %out.vec, align 64
; Stride-2 interleaved store of two <32 x i32> inputs.
; Both inputs are loaded with align 64, concatenated into one <64 x i32>, and
; reordered with mask <0,32,1,33,...,31,63>; the result is written to %out.vec
; with align 64.
; Expected lowering, as recorded by the checks below.
;  - SSE prefix       an unpcklps/unpckhps pair per 16-byte input chunk and
;                     sixteen 16-byte stores; one intermediate result is
;                     spilled to the stack and reloaded just before its store.
;  - AVX1-ONLY prefix 128-bit vunpcklps/vunpckhps pairs merged with
;                     vinsertf128, then eight 32-byte stores.
;  - AVX2-ONLY prefix 256-bit vunpcklps/vunpckhps followed by vperm2f128
;                     pairs, then eight 32-byte stores.
;  - AVX512 prefix    two index tables (low half and high half of a 32-element
;                     interleave) applied with vpermt2d / vpermi2d to each
;                     64-byte input pair, then four 64-byte stores.
; ("vf32" presumably stands for a vectorization factor of 32 -- TODO confirm.)
253 define void @store_i32_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.vec) nounwind {
254 ; SSE-LABEL: store_i32_stride2_vf32:
256 ; SSE-NEXT: movaps 112(%rdi), %xmm0
257 ; SSE-NEXT: movaps 96(%rdi), %xmm6
258 ; SSE-NEXT: movaps 80(%rdi), %xmm4
259 ; SSE-NEXT: movaps 64(%rdi), %xmm3
260 ; SSE-NEXT: movaps (%rdi), %xmm8
261 ; SSE-NEXT: movaps 16(%rdi), %xmm1
262 ; SSE-NEXT: movaps 32(%rdi), %xmm2
263 ; SSE-NEXT: movaps 48(%rdi), %xmm5
264 ; SSE-NEXT: movaps 96(%rsi), %xmm11
265 ; SSE-NEXT: movaps 80(%rsi), %xmm12
266 ; SSE-NEXT: movaps 64(%rsi), %xmm13
267 ; SSE-NEXT: movaps (%rsi), %xmm9
268 ; SSE-NEXT: movaps 16(%rsi), %xmm10
269 ; SSE-NEXT: movaps 32(%rsi), %xmm14
270 ; SSE-NEXT: movaps 48(%rsi), %xmm15
271 ; SSE-NEXT: movaps %xmm8, %xmm7
272 ; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm9[2],xmm7[3],xmm9[3]
273 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
274 ; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1]
275 ; SSE-NEXT: movaps %xmm1, %xmm9
276 ; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm10[2],xmm9[3],xmm10[3]
277 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1]
278 ; SSE-NEXT: movaps %xmm2, %xmm10
279 ; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm14[2],xmm10[3],xmm14[3]
280 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1]
281 ; SSE-NEXT: movaps %xmm5, %xmm14
282 ; SSE-NEXT: unpckhps {{.*#+}} xmm14 = xmm14[2],xmm15[2],xmm14[3],xmm15[3]
283 ; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm15[0],xmm5[1],xmm15[1]
284 ; SSE-NEXT: movaps %xmm3, %xmm15
285 ; SSE-NEXT: unpckhps {{.*#+}} xmm15 = xmm15[2],xmm13[2],xmm15[3],xmm13[3]
286 ; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1]
287 ; SSE-NEXT: movaps %xmm4, %xmm13
288 ; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm12[2],xmm13[3],xmm12[3]
289 ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1]
290 ; SSE-NEXT: movaps %xmm6, %xmm12
291 ; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm11[2],xmm12[3],xmm11[3]
292 ; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1]
293 ; SSE-NEXT: movaps 112(%rsi), %xmm11
294 ; SSE-NEXT: movaps %xmm0, %xmm7
295 ; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm11[2],xmm7[3],xmm11[3]
296 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1]
297 ; SSE-NEXT: movaps %xmm0, 224(%rdx)
298 ; SSE-NEXT: movaps %xmm7, 240(%rdx)
299 ; SSE-NEXT: movaps %xmm6, 192(%rdx)
300 ; SSE-NEXT: movaps %xmm12, 208(%rdx)
301 ; SSE-NEXT: movaps %xmm4, 160(%rdx)
302 ; SSE-NEXT: movaps %xmm13, 176(%rdx)
303 ; SSE-NEXT: movaps %xmm3, 128(%rdx)
304 ; SSE-NEXT: movaps %xmm15, 144(%rdx)
305 ; SSE-NEXT: movaps %xmm5, 96(%rdx)
306 ; SSE-NEXT: movaps %xmm14, 112(%rdx)
307 ; SSE-NEXT: movaps %xmm2, 64(%rdx)
308 ; SSE-NEXT: movaps %xmm10, 80(%rdx)
309 ; SSE-NEXT: movaps %xmm1, 32(%rdx)
310 ; SSE-NEXT: movaps %xmm9, 48(%rdx)
311 ; SSE-NEXT: movaps %xmm8, (%rdx)
312 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
313 ; SSE-NEXT: movaps %xmm0, 16(%rdx)
316 ; AVX1-ONLY-LABEL: store_i32_stride2_vf32:
317 ; AVX1-ONLY: # %bb.0:
318 ; AVX1-ONLY-NEXT: vmovaps 112(%rsi), %xmm0
319 ; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm1
320 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
321 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
322 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
323 ; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm1
324 ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm2
325 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
326 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
327 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
328 ; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm2
329 ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm3
330 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
331 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
332 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
333 ; AVX1-ONLY-NEXT: vmovaps 80(%rsi), %xmm3
334 ; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm4
335 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
336 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
337 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3
338 ; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm4
339 ; AVX1-ONLY-NEXT: vmovaps 16(%rsi), %xmm5
340 ; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm6
341 ; AVX1-ONLY-NEXT: vmovaps 48(%rsi), %xmm7
342 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm8
343 ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm9
344 ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm10
345 ; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm11
346 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm8[2],xmm4[2],xmm8[3],xmm4[3]
347 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1]
348 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm4, %ymm4
349 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm8 = xmm10[2],xmm6[2],xmm10[3],xmm6[3]
350 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm10[0],xmm6[0],xmm10[1],xmm6[1]
351 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm6, %ymm6
352 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm8 = xmm11[2],xmm7[2],xmm11[3],xmm7[3]
353 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm11[0],xmm7[0],xmm11[1],xmm7[1]
354 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7
355 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm8 = xmm9[2],xmm5[2],xmm9[3],xmm5[3]
356 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm9[0],xmm5[0],xmm9[1],xmm5[1]
357 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm5, %ymm5
358 ; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%rdx)
359 ; AVX1-ONLY-NEXT: vmovaps %ymm7, 96(%rdx)
360 ; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%rdx)
361 ; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rdx)
362 ; AVX1-ONLY-NEXT: vmovaps %ymm3, 160(%rdx)
363 ; AVX1-ONLY-NEXT: vmovaps %ymm2, 128(%rdx)
364 ; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rdx)
365 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rdx)
366 ; AVX1-ONLY-NEXT: vzeroupper
367 ; AVX1-ONLY-NEXT: retq
369 ; AVX2-ONLY-LABEL: store_i32_stride2_vf32:
370 ; AVX2-ONLY: # %bb.0:
371 ; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm0
372 ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm1
373 ; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm2
374 ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm3
375 ; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm4
376 ; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm5
377 ; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm6
378 ; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm7
379 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm0[2],ymm4[2],ymm0[3],ymm4[3],ymm0[6],ymm4[6],ymm0[7],ymm4[7]
380 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[1],ymm4[1],ymm0[4],ymm4[4],ymm0[5],ymm4[5]
381 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3],ymm8[2,3]
382 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm8[0,1]
383 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[6],ymm5[6],ymm1[7],ymm5[7]
384 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[4],ymm5[4],ymm1[5],ymm5[5]
385 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm1[2,3],ymm8[2,3]
386 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[0,1],ymm8[0,1]
387 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm2[2],ymm6[2],ymm2[3],ymm6[3],ymm2[6],ymm6[6],ymm2[7],ymm6[7]
388 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm2[0],ymm6[0],ymm2[1],ymm6[1],ymm2[4],ymm6[4],ymm2[5],ymm6[5]
389 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm2[2,3],ymm8[2,3]
390 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[0,1],ymm8[0,1]
391 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm3[2],ymm7[2],ymm3[3],ymm7[3],ymm3[6],ymm7[6],ymm3[7],ymm7[7]
392 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm3[0],ymm7[0],ymm3[1],ymm7[1],ymm3[4],ymm7[4],ymm3[5],ymm7[5]
393 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm3[2,3],ymm8[2,3]
394 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[0,1],ymm8[0,1]
395 ; AVX2-ONLY-NEXT: vmovaps %ymm3, 192(%rdx)
396 ; AVX2-ONLY-NEXT: vmovaps %ymm7, 224(%rdx)
397 ; AVX2-ONLY-NEXT: vmovaps %ymm2, 128(%rdx)
398 ; AVX2-ONLY-NEXT: vmovaps %ymm6, 160(%rdx)
399 ; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rdx)
400 ; AVX2-ONLY-NEXT: vmovaps %ymm5, 96(%rdx)
401 ; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rdx)
402 ; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%rdx)
403 ; AVX2-ONLY-NEXT: vzeroupper
404 ; AVX2-ONLY-NEXT: retq
406 ; AVX512-LABEL: store_i32_stride2_vf32:
408 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
409 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1
410 ; AVX512-NEXT: vmovdqa64 (%rsi), %zmm2
411 ; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm3
412 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31]
413 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5
414 ; AVX512-NEXT: vpermt2d %zmm2, %zmm4, %zmm5
415 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23]
416 ; AVX512-NEXT: vpermt2d %zmm2, %zmm6, %zmm0
417 ; AVX512-NEXT: vpermi2d %zmm3, %zmm1, %zmm4
418 ; AVX512-NEXT: vpermt2d %zmm3, %zmm6, %zmm1
419 ; AVX512-NEXT: vmovdqa64 %zmm1, 128(%rdx)
420 ; AVX512-NEXT: vmovdqa64 %zmm4, 192(%rdx)
421 ; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx)
422 ; AVX512-NEXT: vmovdqa64 %zmm5, 64(%rdx)
423 ; AVX512-NEXT: vzeroupper
; Load the two source vectors.
425 %in.vec0 = load <32 x i32>, ptr %in.vecptr0, align 64
426 %in.vec1 = load <32 x i32>, ptr %in.vecptr1, align 64
; Identity mask over both operands, i.e. a plain concatenation.
427 %1 = shufflevector <32 x i32> %in.vec0, <32 x i32> %in.vec1, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
; Interleave mask, alternating first-input and second-input elements.
428 %interleaved.vec = shufflevector <64 x i32> %1, <64 x i32> poison, <64 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
429 store <64 x i32> %interleaved.vec, ptr %out.vec, align 64
433 define void @store_i32_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.vec) nounwind {
434 ; SSE-LABEL: store_i32_stride2_vf64:
436 ; SSE-NEXT: subq $152, %rsp
437 ; SSE-NEXT: movaps 112(%rdi), %xmm14
438 ; SSE-NEXT: movaps 96(%rdi), %xmm13
439 ; SSE-NEXT: movaps 80(%rdi), %xmm11
440 ; SSE-NEXT: movaps 64(%rdi), %xmm10
441 ; SSE-NEXT: movaps (%rdi), %xmm7
442 ; SSE-NEXT: movaps 16(%rdi), %xmm8
443 ; SSE-NEXT: movaps 32(%rdi), %xmm9
444 ; SSE-NEXT: movaps 48(%rdi), %xmm12
445 ; SSE-NEXT: movaps 96(%rsi), %xmm0
446 ; SSE-NEXT: movaps 80(%rsi), %xmm1
447 ; SSE-NEXT: movaps 64(%rsi), %xmm2
448 ; SSE-NEXT: movaps (%rsi), %xmm3
449 ; SSE-NEXT: movaps 16(%rsi), %xmm4
450 ; SSE-NEXT: movaps 32(%rsi), %xmm5
451 ; SSE-NEXT: movaps 48(%rsi), %xmm6
452 ; SSE-NEXT: movaps %xmm7, %xmm15
453 ; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm3[0],xmm15[1],xmm3[1]
454 ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
455 ; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm3[2],xmm7[3],xmm3[3]
456 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
457 ; SSE-NEXT: movaps %xmm8, %xmm7
458 ; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1]
459 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
460 ; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm4[2],xmm8[3],xmm4[3]
461 ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
462 ; SSE-NEXT: movaps %xmm9, %xmm4
463 ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
464 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
465 ; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm5[2],xmm9[3],xmm5[3]
466 ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
467 ; SSE-NEXT: movaps %xmm12, %xmm4
468 ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1]
469 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
470 ; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm6[2],xmm12[3],xmm6[3]
471 ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
472 ; SSE-NEXT: movaps %xmm10, %xmm3
473 ; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
474 ; SSE-NEXT: movaps %xmm3, (%rsp) # 16-byte Spill
475 ; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm2[2],xmm10[3],xmm2[3]
476 ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
477 ; SSE-NEXT: movaps %xmm11, %xmm2
478 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
479 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
480 ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm1[2],xmm11[3],xmm1[3]
481 ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
482 ; SSE-NEXT: movaps %xmm13, %xmm1
483 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
484 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
485 ; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm0[2],xmm13[3],xmm0[3]
486 ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
487 ; SSE-NEXT: movaps 112(%rsi), %xmm0
488 ; SSE-NEXT: movaps %xmm14, %xmm1
489 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
490 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
491 ; SSE-NEXT: unpckhps {{.*#+}} xmm14 = xmm14[2],xmm0[2],xmm14[3],xmm0[3]
492 ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
493 ; SSE-NEXT: movaps 128(%rdi), %xmm15
494 ; SSE-NEXT: movaps 128(%rsi), %xmm0
495 ; SSE-NEXT: movaps %xmm15, %xmm1
496 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
497 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
498 ; SSE-NEXT: unpckhps {{.*#+}} xmm15 = xmm15[2],xmm0[2],xmm15[3],xmm0[3]
499 ; SSE-NEXT: movaps 144(%rdi), %xmm13
500 ; SSE-NEXT: movaps 144(%rsi), %xmm0
501 ; SSE-NEXT: movaps %xmm13, %xmm14
502 ; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1]
503 ; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm0[2],xmm13[3],xmm0[3]
504 ; SSE-NEXT: movaps 160(%rdi), %xmm10
505 ; SSE-NEXT: movaps 160(%rsi), %xmm0
506 ; SSE-NEXT: movaps %xmm10, %xmm12
507 ; SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1]
508 ; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm0[2],xmm10[3],xmm0[3]
509 ; SSE-NEXT: movaps 176(%rdi), %xmm8
510 ; SSE-NEXT: movaps 176(%rsi), %xmm0
511 ; SSE-NEXT: movaps %xmm8, %xmm11
512 ; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1]
513 ; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm0[2],xmm8[3],xmm0[3]
514 ; SSE-NEXT: movaps 192(%rdi), %xmm6
515 ; SSE-NEXT: movaps 192(%rsi), %xmm0
516 ; SSE-NEXT: movaps %xmm6, %xmm9
517 ; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1]
518 ; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3]
519 ; SSE-NEXT: movaps 208(%rdi), %xmm5
520 ; SSE-NEXT: movaps 208(%rsi), %xmm1
521 ; SSE-NEXT: movaps %xmm5, %xmm7
522 ; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1]
523 ; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm1[2],xmm5[3],xmm1[3]
524 ; SSE-NEXT: movaps 224(%rdi), %xmm1
525 ; SSE-NEXT: movaps 224(%rsi), %xmm3
526 ; SSE-NEXT: movaps %xmm1, %xmm2
527 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
528 ; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
529 ; SSE-NEXT: movaps 240(%rdi), %xmm3
530 ; SSE-NEXT: movaps 240(%rsi), %xmm4
531 ; SSE-NEXT: movaps %xmm3, %xmm0
532 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
533 ; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
534 ; SSE-NEXT: movaps %xmm3, 496(%rdx)
535 ; SSE-NEXT: movaps %xmm0, 480(%rdx)
536 ; SSE-NEXT: movaps %xmm1, 464(%rdx)
537 ; SSE-NEXT: movaps %xmm2, 448(%rdx)
538 ; SSE-NEXT: movaps %xmm5, 432(%rdx)
539 ; SSE-NEXT: movaps %xmm7, 416(%rdx)
540 ; SSE-NEXT: movaps %xmm6, 400(%rdx)
541 ; SSE-NEXT: movaps %xmm9, 384(%rdx)
542 ; SSE-NEXT: movaps %xmm8, 368(%rdx)
543 ; SSE-NEXT: movaps %xmm11, 352(%rdx)
544 ; SSE-NEXT: movaps %xmm10, 336(%rdx)
545 ; SSE-NEXT: movaps %xmm12, 320(%rdx)
546 ; SSE-NEXT: movaps %xmm13, 304(%rdx)
547 ; SSE-NEXT: movaps %xmm14, 288(%rdx)
548 ; SSE-NEXT: movaps %xmm15, 272(%rdx)
549 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
550 ; SSE-NEXT: movaps %xmm0, 256(%rdx)
551 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
552 ; SSE-NEXT: movaps %xmm0, 240(%rdx)
553 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
554 ; SSE-NEXT: movaps %xmm0, 224(%rdx)
555 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
556 ; SSE-NEXT: movaps %xmm0, 208(%rdx)
557 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
558 ; SSE-NEXT: movaps %xmm0, 192(%rdx)
559 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
560 ; SSE-NEXT: movaps %xmm0, 176(%rdx)
561 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
562 ; SSE-NEXT: movaps %xmm0, 160(%rdx)
563 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
564 ; SSE-NEXT: movaps %xmm0, 144(%rdx)
565 ; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
566 ; SSE-NEXT: movaps %xmm0, 128(%rdx)
567 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
568 ; SSE-NEXT: movaps %xmm0, 112(%rdx)
569 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
570 ; SSE-NEXT: movaps %xmm0, 96(%rdx)
571 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
572 ; SSE-NEXT: movaps %xmm0, 80(%rdx)
573 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
574 ; SSE-NEXT: movaps %xmm0, 64(%rdx)
575 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
576 ; SSE-NEXT: movaps %xmm0, 48(%rdx)
577 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
578 ; SSE-NEXT: movaps %xmm0, 32(%rdx)
579 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
580 ; SSE-NEXT: movaps %xmm0, 16(%rdx)
581 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
582 ; SSE-NEXT: movaps %xmm0, (%rdx)
583 ; SSE-NEXT: addq $152, %rsp
586 ; AVX1-ONLY-LABEL: store_i32_stride2_vf64:
587 ; AVX1-ONLY: # %bb.0:
588 ; AVX1-ONLY-NEXT: vmovaps 240(%rsi), %xmm0
589 ; AVX1-ONLY-NEXT: vmovaps 240(%rdi), %xmm1
590 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
591 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
592 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
593 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
594 ; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %xmm1
595 ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm2
596 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
597 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
598 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm0
599 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
600 ; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %xmm2
601 ; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm3
602 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
603 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
604 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
605 ; AVX1-ONLY-NEXT: vmovaps 144(%rsi), %xmm3
606 ; AVX1-ONLY-NEXT: vmovaps 144(%rdi), %xmm4
607 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
608 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
609 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3
610 ; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm4
611 ; AVX1-ONLY-NEXT: vmovaps 16(%rsi), %xmm8
612 ; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm5
613 ; AVX1-ONLY-NEXT: vmovaps 48(%rsi), %xmm11
614 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm6
615 ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm9
616 ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm7
617 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = xmm6[2],xmm4[2],xmm6[3],xmm4[3]
618 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
619 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm4, %ymm4
620 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm7[2],xmm5[2],xmm7[3],xmm5[3]
621 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1]
622 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5
623 ; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm6
624 ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm7
625 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = xmm7[2],xmm6[2],xmm7[3],xmm6[3]
626 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
627 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm6, %ymm6
628 ; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm7
629 ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm10
630 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm10[2],xmm7[2],xmm10[3],xmm7[3]
631 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm10[0],xmm7[0],xmm10[1],xmm7[1]
632 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm7, %ymm7
633 ; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %xmm10
634 ; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm12
635 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm13 = xmm12[2],xmm10[2],xmm12[3],xmm10[3]
636 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1]
637 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm10, %ymm10
638 ; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm12
639 ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm13
640 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm13[2],xmm12[2],xmm13[3],xmm12[3]
641 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1]
642 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm12, %ymm12
643 ; AVX1-ONLY-NEXT: vmovaps 80(%rsi), %xmm13
644 ; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm14
645 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm15 = xmm14[2],xmm13[2],xmm14[3],xmm13[3]
646 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1]
647 ; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm14
648 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm13, %ymm13
649 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm15 = xmm14[2],xmm11[2],xmm14[3],xmm11[3]
650 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm11 = xmm14[0],xmm11[0],xmm14[1],xmm11[1]
651 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm11, %ymm11
652 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm9[2],xmm8[2],xmm9[3],xmm8[3]
653 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
654 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm8, %ymm8
655 ; AVX1-ONLY-NEXT: vmovaps 112(%rsi), %xmm9
656 ; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm14
657 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm15 = xmm14[2],xmm9[2],xmm14[3],xmm9[3]
658 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm14[0],xmm9[0],xmm14[1],xmm9[1]
659 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm9, %ymm9
660 ; AVX1-ONLY-NEXT: vmovaps 176(%rsi), %xmm14
661 ; AVX1-ONLY-NEXT: vmovaps 176(%rdi), %xmm15
662 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm15[2],xmm14[2],xmm15[3],xmm14[3]
663 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
664 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm14, %ymm1
665 ; AVX1-ONLY-NEXT: vmovaps 208(%rsi), %xmm14
666 ; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm15
667 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm15[2],xmm14[2],xmm15[3],xmm14[3]
668 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
669 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm14, %ymm0
670 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 416(%rdx)
671 ; AVX1-ONLY-NEXT: vmovaps %ymm1, 352(%rdx)
672 ; AVX1-ONLY-NEXT: vmovaps %ymm9, 224(%rdx)
673 ; AVX1-ONLY-NEXT: vmovaps %ymm8, 32(%rdx)
674 ; AVX1-ONLY-NEXT: vmovaps %ymm11, 96(%rdx)
675 ; AVX1-ONLY-NEXT: vmovaps %ymm13, 160(%rdx)
676 ; AVX1-ONLY-NEXT: vmovaps %ymm12, 384(%rdx)
677 ; AVX1-ONLY-NEXT: vmovaps %ymm10, 320(%rdx)
678 ; AVX1-ONLY-NEXT: vmovaps %ymm7, 192(%rdx)
679 ; AVX1-ONLY-NEXT: vmovaps %ymm6, 128(%rdx)
680 ; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%rdx)
681 ; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rdx)
682 ; AVX1-ONLY-NEXT: vmovaps %ymm3, 288(%rdx)
683 ; AVX1-ONLY-NEXT: vmovaps %ymm2, 256(%rdx)
684 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
685 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 448(%rdx)
686 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
687 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 480(%rdx)
688 ; AVX1-ONLY-NEXT: vzeroupper
689 ; AVX1-ONLY-NEXT: retq
691 ; AVX2-ONLY-LABEL: store_i32_stride2_vf64:
692 ; AVX2-ONLY: # %bb.0:
693 ; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm0
694 ; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm3
695 ; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm6
696 ; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm8
697 ; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm1
698 ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm4
699 ; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm7
700 ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm9
701 ; AVX2-ONLY-NEXT: vmovaps 192(%rsi), %ymm10
702 ; AVX2-ONLY-NEXT: vmovaps 160(%rsi), %ymm11
703 ; AVX2-ONLY-NEXT: vmovaps 128(%rsi), %ymm12
704 ; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm2
705 ; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm5
706 ; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm13
707 ; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm14
708 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm15 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7]
709 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5]
710 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm15[2,3]
711 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
712 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[0,1],ymm15[0,1]
713 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm15 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7]
714 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[4],ymm5[4],ymm4[5],ymm5[5]
715 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm5[2,3],ymm15[2,3]
716 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[0,1],ymm15[0,1]
717 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm15 = ymm7[2],ymm13[2],ymm7[3],ymm13[3],ymm7[6],ymm13[6],ymm7[7],ymm13[7]
718 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm13 = ymm7[0],ymm13[0],ymm7[1],ymm13[1],ymm7[4],ymm13[4],ymm7[5],ymm13[5]
719 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm13[2,3],ymm15[2,3]
720 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm13[0,1],ymm15[0,1]
721 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm15 = ymm9[2],ymm14[2],ymm9[3],ymm14[3],ymm9[6],ymm14[6],ymm9[7],ymm14[7]
722 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm9 = ymm9[0],ymm14[0],ymm9[1],ymm14[1],ymm9[4],ymm14[4],ymm9[5],ymm14[5]
723 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm9[2,3],ymm15[2,3]
724 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[0,1],ymm15[0,1]
725 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm15 = ymm8[2],ymm12[2],ymm8[3],ymm12[3],ymm8[6],ymm12[6],ymm8[7],ymm12[7]
726 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = ymm8[0],ymm12[0],ymm8[1],ymm12[1],ymm8[4],ymm12[4],ymm8[5],ymm12[5]
727 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm8[2,3],ymm15[2,3]
728 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm8[0,1],ymm15[0,1]
729 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm15 = ymm6[2],ymm11[2],ymm6[3],ymm11[3],ymm6[6],ymm11[6],ymm6[7],ymm11[7]
730 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm6[0],ymm11[0],ymm6[1],ymm11[1],ymm6[4],ymm11[4],ymm6[5],ymm11[5]
731 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm6[2,3],ymm15[2,3]
732 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm6[0,1],ymm15[0,1]
733 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm15 = ymm3[2],ymm10[2],ymm3[3],ymm10[3],ymm3[6],ymm10[6],ymm3[7],ymm10[7]
734 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm3[0],ymm10[0],ymm3[1],ymm10[1],ymm3[4],ymm10[4],ymm3[5],ymm10[5]
735 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm3[2,3],ymm15[2,3]
736 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[0,1],ymm15[0,1]
737 ; AVX2-ONLY-NEXT: vmovaps 224(%rsi), %ymm15
738 ; AVX2-ONLY-NEXT: vmovaps %ymm0, %ymm1
739 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm15[2],ymm0[3],ymm15[3],ymm0[6],ymm15[6],ymm0[7],ymm15[7]
740 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm15 = ymm1[0],ymm15[0],ymm1[1],ymm15[1],ymm1[4],ymm15[4],ymm1[5],ymm15[5]
741 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm15[2,3],ymm0[2,3]
742 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[0,1],ymm0[0,1]
743 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 448(%rdx)
744 ; AVX2-ONLY-NEXT: vmovaps %ymm1, 480(%rdx)
745 ; AVX2-ONLY-NEXT: vmovaps %ymm3, 384(%rdx)
746 ; AVX2-ONLY-NEXT: vmovaps %ymm10, 416(%rdx)
747 ; AVX2-ONLY-NEXT: vmovaps %ymm6, 320(%rdx)
748 ; AVX2-ONLY-NEXT: vmovaps %ymm11, 352(%rdx)
749 ; AVX2-ONLY-NEXT: vmovaps %ymm8, 256(%rdx)
750 ; AVX2-ONLY-NEXT: vmovaps %ymm12, 288(%rdx)
751 ; AVX2-ONLY-NEXT: vmovaps %ymm9, 192(%rdx)
752 ; AVX2-ONLY-NEXT: vmovaps %ymm14, 224(%rdx)
753 ; AVX2-ONLY-NEXT: vmovaps %ymm13, 128(%rdx)
754 ; AVX2-ONLY-NEXT: vmovaps %ymm7, 160(%rdx)
755 ; AVX2-ONLY-NEXT: vmovaps %ymm5, 64(%rdx)
756 ; AVX2-ONLY-NEXT: vmovaps %ymm4, 96(%rdx)
757 ; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rdx)
758 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
759 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rdx)
760 ; AVX2-ONLY-NEXT: vzeroupper
761 ; AVX2-ONLY-NEXT: retq
763 ; AVX512-LABEL: store_i32_stride2_vf64:
765 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
766 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1
767 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm2
768 ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm3
769 ; AVX512-NEXT: vmovdqa64 (%rsi), %zmm4
770 ; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm5
771 ; AVX512-NEXT: vmovdqa64 128(%rsi), %zmm6
772 ; AVX512-NEXT: vmovdqa64 192(%rsi), %zmm7
773 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31]
774 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm9
775 ; AVX512-NEXT: vpermt2d %zmm4, %zmm8, %zmm9
776 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23]
777 ; AVX512-NEXT: vpermt2d %zmm4, %zmm10, %zmm0
778 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4
779 ; AVX512-NEXT: vpermt2d %zmm5, %zmm8, %zmm4
780 ; AVX512-NEXT: vpermt2d %zmm5, %zmm10, %zmm1
781 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm5
782 ; AVX512-NEXT: vpermt2d %zmm6, %zmm8, %zmm5
783 ; AVX512-NEXT: vpermt2d %zmm6, %zmm10, %zmm2
784 ; AVX512-NEXT: vpermi2d %zmm7, %zmm3, %zmm8
785 ; AVX512-NEXT: vpermt2d %zmm7, %zmm10, %zmm3
786 ; AVX512-NEXT: vmovdqa64 %zmm3, 384(%rdx)
787 ; AVX512-NEXT: vmovdqa64 %zmm8, 448(%rdx)
788 ; AVX512-NEXT: vmovdqa64 %zmm2, 256(%rdx)
789 ; AVX512-NEXT: vmovdqa64 %zmm5, 320(%rdx)
790 ; AVX512-NEXT: vmovdqa64 %zmm1, 128(%rdx)
791 ; AVX512-NEXT: vmovdqa64 %zmm4, 192(%rdx)
792 ; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx)
793 ; AVX512-NEXT: vmovdqa64 %zmm9, 64(%rdx)
794 ; AVX512-NEXT: vzeroupper
796 %in.vec0 = load <64 x i32>, ptr %in.vecptr0, align 64
797 %in.vec1 = load <64 x i32>, ptr %in.vecptr1, align 64
798 %1 = shufflevector <64 x i32> %in.vec0, <64 x i32> %in.vec1, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
799 %interleaved.vec = shufflevector <128 x i32> %1, <128 x i32> poison, <128 x i32> <i32 0, i32 64, i32 1, i32 65, i32 2, i32 66, i32 3, i32 67, i32 4, i32 68, i32 5, i32 69, i32 6, i32 70, i32 7, i32 71, i32 8, i32 72, i32 9, i32 73, i32 10, i32 74, i32 11, i32 75, i32 12, i32 76, i32 13, i32 77, i32 14, i32 78, i32 15, i32 79, i32 16, i32 80, i32 17, i32 81, i32 18, i32 82, i32 19, i32 83, i32 20, i32 84, i32 21, i32 85, i32 22, i32 86, i32 23, i32 87, i32 24, i32 88, i32 25, i32 89, i32 26, i32 90, i32 27, i32 91, i32 28, i32 92, i32 29, i32 93, i32 30, i32 94, i32 31, i32 95, i32 32, i32 96, i32 33, i32 97, i32 34, i32 98, i32 35, i32 99, i32 36, i32 100, i32 37, i32 101, i32 38, i32 102, i32 39, i32 103, i32 40, i32 104, i32 41, i32 105, i32 42, i32 106, i32 43, i32 107, i32 44, i32 108, i32 45, i32 109, i32 46, i32 110, i32 47, i32 111, i32 48, i32 112, i32 49, i32 113, i32 50, i32 114, i32 51, i32 115, i32 52, i32 116, i32 53, i32 117, i32 54, i32 118, i32 55, i32 119, i32 56, i32 120, i32 57, i32 121, i32 58, i32 122, i32 59, i32 123, i32 60, i32 124, i32 61, i32 125, i32 62, i32 126, i32 63, i32 127>
800 store <128 x i32> %interleaved.vec, ptr %out.vec, align 64
803 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
806 ; AVX2-FAST-PERLANE: {{.*}}
808 ; AVX512-FAST: {{.*}}
809 ; AVX512-SLOW: {{.*}}
811 ; AVX512BW-FAST: {{.*}}
812 ; AVX512BW-ONLY-FAST: {{.*}}
813 ; AVX512BW-ONLY-SLOW: {{.*}}
814 ; AVX512BW-SLOW: {{.*}}
815 ; AVX512DQ-FAST: {{.*}}
816 ; AVX512DQ-SLOW: {{.*}}
817 ; AVX512DQBW-FAST: {{.*}}
818 ; AVX512DQBW-SLOW: {{.*}}
820 ; AVX512F-FAST: {{.*}}
821 ; AVX512F-ONLY-FAST: {{.*}}
822 ; AVX512F-ONLY-SLOW: {{.*}}
823 ; AVX512F-SLOW: {{.*}}