1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
3 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX
4 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
5 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FP
6 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FCP
7 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512
8 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512-FCP
9 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512DQ
10 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-FCP
11 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW
12 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW-FCP
13 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512DQ-BW
14 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-BW-FCP
16 ; These patterns are produced by LoopVectorizer for interleaved loads.
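; A minimal sketch (not checked by FileCheck, names illustrative only) of the
; kind of scalar source loop that the LoopVectorizer turns into the wide-load
; plus even/odd shufflevector patterns tested below:
;   for (i = 0; i < n; ++i) {
;     out0[i] = in[2*i];      // becomes the <i32 0, 2, 4, ...> shuffle
;     out1[i] = in[2*i + 1];  // becomes the <i32 1, 3, 5, ...> shuffle
;   }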
18 define void @load_i32_stride2_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nounwind {
19 ; SSE-LABEL: load_i32_stride2_vf2:
20 ; SSE: # %bb.0:
21 ; SSE-NEXT: movdqa (%rdi), %xmm0
22 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
23 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
24 ; SSE-NEXT: movq %xmm1, (%rsi)
25 ; SSE-NEXT: movq %xmm0, (%rdx)
26 ; SSE-NEXT: retq
27 ;
28 ; AVX-LABEL: load_i32_stride2_vf2:
29 ; AVX: # %bb.0:
30 ; AVX-NEXT: vmovaps (%rdi), %xmm0
31 ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,2,2,3]
32 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
33 ; AVX-NEXT: vmovlps %xmm1, (%rsi)
34 ; AVX-NEXT: vmovlps %xmm0, (%rdx)
35 ; AVX-NEXT: retq
36 ;
37 ; AVX2-LABEL: load_i32_stride2_vf2:
38 ; AVX2: # %bb.0:
39 ; AVX2-NEXT: vmovaps (%rdi), %xmm0
40 ; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,2,2,3]
41 ; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
42 ; AVX2-NEXT: vmovlps %xmm1, (%rsi)
43 ; AVX2-NEXT: vmovlps %xmm0, (%rdx)
44 ; AVX2-NEXT: retq
45 ;
46 ; AVX2-FP-LABEL: load_i32_stride2_vf2:
47 ; AVX2-FP: # %bb.0:
48 ; AVX2-FP-NEXT: vmovaps (%rdi), %xmm0
49 ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,2,2,3]
50 ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
51 ; AVX2-FP-NEXT: vmovlps %xmm1, (%rsi)
52 ; AVX2-FP-NEXT: vmovlps %xmm0, (%rdx)
53 ; AVX2-FP-NEXT: retq
54 ;
55 ; AVX2-FCP-LABEL: load_i32_stride2_vf2:
56 ; AVX2-FCP: # %bb.0:
57 ; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm0
58 ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,2,2,3]
59 ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
60 ; AVX2-FCP-NEXT: vmovlps %xmm1, (%rsi)
61 ; AVX2-FCP-NEXT: vmovlps %xmm0, (%rdx)
62 ; AVX2-FCP-NEXT: retq
63 ;
64 ; AVX512-LABEL: load_i32_stride2_vf2:
65 ; AVX512: # %bb.0:
66 ; AVX512-NEXT: vmovaps (%rdi), %xmm0
67 ; AVX512-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,2,2,3]
68 ; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
69 ; AVX512-NEXT: vmovlps %xmm1, (%rsi)
70 ; AVX512-NEXT: vmovlps %xmm0, (%rdx)
71 ; AVX512-NEXT: retq
72 ;
73 ; AVX512-FCP-LABEL: load_i32_stride2_vf2:
74 ; AVX512-FCP: # %bb.0:
75 ; AVX512-FCP-NEXT: vmovaps (%rdi), %xmm0
76 ; AVX512-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,2,2,3]
77 ; AVX512-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
78 ; AVX512-FCP-NEXT: vmovlps %xmm1, (%rsi)
79 ; AVX512-FCP-NEXT: vmovlps %xmm0, (%rdx)
80 ; AVX512-FCP-NEXT: retq
81 ;
82 ; AVX512DQ-LABEL: load_i32_stride2_vf2:
83 ; AVX512DQ: # %bb.0:
84 ; AVX512DQ-NEXT: vmovaps (%rdi), %xmm0
85 ; AVX512DQ-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,2,2,3]
86 ; AVX512DQ-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
87 ; AVX512DQ-NEXT: vmovlps %xmm1, (%rsi)
88 ; AVX512DQ-NEXT: vmovlps %xmm0, (%rdx)
89 ; AVX512DQ-NEXT: retq
90 ;
91 ; AVX512DQ-FCP-LABEL: load_i32_stride2_vf2:
92 ; AVX512DQ-FCP: # %bb.0:
93 ; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %xmm0
94 ; AVX512DQ-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,2,2,3]
95 ; AVX512DQ-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
96 ; AVX512DQ-FCP-NEXT: vmovlps %xmm1, (%rsi)
97 ; AVX512DQ-FCP-NEXT: vmovlps %xmm0, (%rdx)
98 ; AVX512DQ-FCP-NEXT: retq
99 ;
100 ; AVX512BW-LABEL: load_i32_stride2_vf2:
101 ; AVX512BW: # %bb.0:
102 ; AVX512BW-NEXT: vmovaps (%rdi), %xmm0
103 ; AVX512BW-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,2,2,3]
104 ; AVX512BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
105 ; AVX512BW-NEXT: vmovlps %xmm1, (%rsi)
106 ; AVX512BW-NEXT: vmovlps %xmm0, (%rdx)
107 ; AVX512BW-NEXT: retq
108 ;
109 ; AVX512BW-FCP-LABEL: load_i32_stride2_vf2:
110 ; AVX512BW-FCP: # %bb.0:
111 ; AVX512BW-FCP-NEXT: vmovaps (%rdi), %xmm0
112 ; AVX512BW-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,2,2,3]
113 ; AVX512BW-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
114 ; AVX512BW-FCP-NEXT: vmovlps %xmm1, (%rsi)
115 ; AVX512BW-FCP-NEXT: vmovlps %xmm0, (%rdx)
116 ; AVX512BW-FCP-NEXT: retq
117 ;
118 ; AVX512DQ-BW-LABEL: load_i32_stride2_vf2:
119 ; AVX512DQ-BW: # %bb.0:
120 ; AVX512DQ-BW-NEXT: vmovaps (%rdi), %xmm0
121 ; AVX512DQ-BW-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,2,2,3]
122 ; AVX512DQ-BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
123 ; AVX512DQ-BW-NEXT: vmovlps %xmm1, (%rsi)
124 ; AVX512DQ-BW-NEXT: vmovlps %xmm0, (%rdx)
125 ; AVX512DQ-BW-NEXT: retq
126 ;
127 ; AVX512DQ-BW-FCP-LABEL: load_i32_stride2_vf2:
128 ; AVX512DQ-BW-FCP: # %bb.0:
129 ; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %xmm0
130 ; AVX512DQ-BW-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,2,2,3]
131 ; AVX512DQ-BW-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
132 ; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm1, (%rsi)
133 ; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm0, (%rdx)
134 ; AVX512DQ-BW-FCP-NEXT: retq
135 %wide.vec = load <4 x i32>, ptr %in.vec, align 64
136 %strided.vec0 = shufflevector <4 x i32> %wide.vec, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
137 %strided.vec1 = shufflevector <4 x i32> %wide.vec, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
138 store <2 x i32> %strided.vec0, ptr %out.vec0, align 64
139 store <2 x i32> %strided.vec1, ptr %out.vec1, align 64
140 ret void
141 }

143 define void @load_i32_stride2_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nounwind {
144 ; SSE-LABEL: load_i32_stride2_vf4:
145 ; SSE: # %bb.0:
146 ; SSE-NEXT: movaps (%rdi), %xmm0
147 ; SSE-NEXT: movaps 16(%rdi), %xmm1
148 ; SSE-NEXT: movaps %xmm0, %xmm2
149 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2]
150 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
151 ; SSE-NEXT: movaps %xmm2, (%rsi)
152 ; SSE-NEXT: movaps %xmm0, (%rdx)
153 ; SSE-NEXT: retq
154 ;
155 ; AVX-LABEL: load_i32_stride2_vf4:
156 ; AVX: # %bb.0:
157 ; AVX-NEXT: vmovaps (%rdi), %xmm0
158 ; AVX-NEXT: vmovaps 16(%rdi), %xmm1
159 ; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,2],xmm1[0,2]
160 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
161 ; AVX-NEXT: vmovaps %xmm2, (%rsi)
162 ; AVX-NEXT: vmovaps %xmm0, (%rdx)
163 ; AVX-NEXT: retq
164 ;
165 ; AVX2-LABEL: load_i32_stride2_vf4:
166 ; AVX2: # %bb.0:
167 ; AVX2-NEXT: vmovaps (%rdi), %xmm0
168 ; AVX2-NEXT: vmovaps 16(%rdi), %xmm1
169 ; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,2],xmm1[0,2]
170 ; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
171 ; AVX2-NEXT: vmovaps %xmm2, (%rsi)
172 ; AVX2-NEXT: vmovaps %xmm0, (%rdx)
173 ; AVX2-NEXT: retq
174 ;
175 ; AVX2-FP-LABEL: load_i32_stride2_vf4:
176 ; AVX2-FP: # %bb.0:
177 ; AVX2-FP-NEXT: vmovaps (%rdi), %xmm0
178 ; AVX2-FP-NEXT: vmovaps 16(%rdi), %xmm1
179 ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,2],xmm1[0,2]
180 ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
181 ; AVX2-FP-NEXT: vmovaps %xmm2, (%rsi)
182 ; AVX2-FP-NEXT: vmovaps %xmm0, (%rdx)
183 ; AVX2-FP-NEXT: retq
184 ;
185 ; AVX2-FCP-LABEL: load_i32_stride2_vf4:
186 ; AVX2-FCP: # %bb.0:
187 ; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm0
188 ; AVX2-FCP-NEXT: vmovaps 16(%rdi), %xmm1
189 ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,2],xmm1[0,2]
190 ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
191 ; AVX2-FCP-NEXT: vmovaps %xmm2, (%rsi)
192 ; AVX2-FCP-NEXT: vmovaps %xmm0, (%rdx)
193 ; AVX2-FCP-NEXT: retq
194 ;
195 ; AVX512-LABEL: load_i32_stride2_vf4:
196 ; AVX512: # %bb.0:
197 ; AVX512-NEXT: vmovdqa (%rdi), %ymm0
198 ; AVX512-NEXT: vmovaps (%rdi), %xmm1
199 ; AVX512-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3],mem[1,3]
200 ; AVX512-NEXT: vpmovqd %ymm0, (%rsi)
201 ; AVX512-NEXT: vmovaps %xmm1, (%rdx)
202 ; AVX512-NEXT: vzeroupper
203 ; AVX512-NEXT: retq
204 ;
205 ; AVX512-FCP-LABEL: load_i32_stride2_vf4:
206 ; AVX512-FCP: # %bb.0:
207 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm0
208 ; AVX512-FCP-NEXT: vmovaps (%rdi), %xmm1
209 ; AVX512-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3],mem[1,3]
210 ; AVX512-FCP-NEXT: vpmovqd %ymm0, (%rsi)
211 ; AVX512-FCP-NEXT: vmovaps %xmm1, (%rdx)
212 ; AVX512-FCP-NEXT: vzeroupper
213 ; AVX512-FCP-NEXT: retq
214 ;
215 ; AVX512DQ-LABEL: load_i32_stride2_vf4:
216 ; AVX512DQ: # %bb.0:
217 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
218 ; AVX512DQ-NEXT: vmovaps (%rdi), %xmm1
219 ; AVX512DQ-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3],mem[1,3]
220 ; AVX512DQ-NEXT: vpmovqd %ymm0, (%rsi)
221 ; AVX512DQ-NEXT: vmovaps %xmm1, (%rdx)
222 ; AVX512DQ-NEXT: vzeroupper
223 ; AVX512DQ-NEXT: retq
224 ;
225 ; AVX512DQ-FCP-LABEL: load_i32_stride2_vf4:
226 ; AVX512DQ-FCP: # %bb.0:
227 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm0
228 ; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %xmm1
229 ; AVX512DQ-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3],mem[1,3]
230 ; AVX512DQ-FCP-NEXT: vpmovqd %ymm0, (%rsi)
231 ; AVX512DQ-FCP-NEXT: vmovaps %xmm1, (%rdx)
232 ; AVX512DQ-FCP-NEXT: vzeroupper
233 ; AVX512DQ-FCP-NEXT: retq
234 ;
235 ; AVX512BW-LABEL: load_i32_stride2_vf4:
236 ; AVX512BW: # %bb.0:
237 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
238 ; AVX512BW-NEXT: vmovaps (%rdi), %xmm1
239 ; AVX512BW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3],mem[1,3]
240 ; AVX512BW-NEXT: vpmovqd %ymm0, (%rsi)
241 ; AVX512BW-NEXT: vmovaps %xmm1, (%rdx)
242 ; AVX512BW-NEXT: vzeroupper
243 ; AVX512BW-NEXT: retq
244 ;
245 ; AVX512BW-FCP-LABEL: load_i32_stride2_vf4:
246 ; AVX512BW-FCP: # %bb.0:
247 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm0
248 ; AVX512BW-FCP-NEXT: vmovaps (%rdi), %xmm1
249 ; AVX512BW-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3],mem[1,3]
250 ; AVX512BW-FCP-NEXT: vpmovqd %ymm0, (%rsi)
251 ; AVX512BW-FCP-NEXT: vmovaps %xmm1, (%rdx)
252 ; AVX512BW-FCP-NEXT: vzeroupper
253 ; AVX512BW-FCP-NEXT: retq
254 ;
255 ; AVX512DQ-BW-LABEL: load_i32_stride2_vf4:
256 ; AVX512DQ-BW: # %bb.0:
257 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm0
258 ; AVX512DQ-BW-NEXT: vmovaps (%rdi), %xmm1
259 ; AVX512DQ-BW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3],mem[1,3]
260 ; AVX512DQ-BW-NEXT: vpmovqd %ymm0, (%rsi)
261 ; AVX512DQ-BW-NEXT: vmovaps %xmm1, (%rdx)
262 ; AVX512DQ-BW-NEXT: vzeroupper
263 ; AVX512DQ-BW-NEXT: retq
264 ;
265 ; AVX512DQ-BW-FCP-LABEL: load_i32_stride2_vf4:
266 ; AVX512DQ-BW-FCP: # %bb.0:
267 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm0
268 ; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %xmm1
269 ; AVX512DQ-BW-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3],mem[1,3]
270 ; AVX512DQ-BW-FCP-NEXT: vpmovqd %ymm0, (%rsi)
271 ; AVX512DQ-BW-FCP-NEXT: vmovaps %xmm1, (%rdx)
272 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
273 ; AVX512DQ-BW-FCP-NEXT: retq
274 %wide.vec = load <8 x i32>, ptr %in.vec, align 64
275 %strided.vec0 = shufflevector <8 x i32> %wide.vec, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
276 %strided.vec1 = shufflevector <8 x i32> %wide.vec, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
277 store <4 x i32> %strided.vec0, ptr %out.vec0, align 64
278 store <4 x i32> %strided.vec1, ptr %out.vec1, align 64
279 ret void
280 }

282 define void @load_i32_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nounwind {
283 ; SSE-LABEL: load_i32_stride2_vf8:
284 ; SSE: # %bb.0:
285 ; SSE-NEXT: movaps (%rdi), %xmm0
286 ; SSE-NEXT: movaps 16(%rdi), %xmm1
287 ; SSE-NEXT: movaps 32(%rdi), %xmm2
288 ; SSE-NEXT: movaps 48(%rdi), %xmm3
289 ; SSE-NEXT: movaps %xmm2, %xmm4
290 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm3[0,2]
291 ; SSE-NEXT: movaps %xmm0, %xmm5
292 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm1[0,2]
293 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3]
294 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
295 ; SSE-NEXT: movaps %xmm5, (%rsi)
296 ; SSE-NEXT: movaps %xmm4, 16(%rsi)
297 ; SSE-NEXT: movaps %xmm0, (%rdx)
298 ; SSE-NEXT: movaps %xmm2, 16(%rdx)
299 ; SSE-NEXT: retq
300 ;
301 ; AVX-LABEL: load_i32_stride2_vf8:
302 ; AVX: # %bb.0:
303 ; AVX-NEXT: vmovaps (%rdi), %ymm0
304 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],mem[2,3]
305 ; AVX-NEXT: vinsertf128 $1, 32(%rdi), %ymm0, %ymm0
306 ; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
307 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
308 ; AVX-NEXT: vmovaps %ymm2, (%rsi)
309 ; AVX-NEXT: vmovaps %ymm0, (%rdx)
310 ; AVX-NEXT: vzeroupper
311 ; AVX-NEXT: retq
312 ;
313 ; AVX2-LABEL: load_i32_stride2_vf8:
314 ; AVX2: # %bb.0:
315 ; AVX2-NEXT: vmovaps (%rdi), %ymm0
316 ; AVX2-NEXT: vmovaps 32(%rdi), %ymm1
317 ; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
318 ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
319 ; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
320 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
321 ; AVX2-NEXT: vmovaps %ymm2, (%rsi)
322 ; AVX2-NEXT: vmovaps %ymm0, (%rdx)
323 ; AVX2-NEXT: vzeroupper
324 ; AVX2-NEXT: retq
325 ;
326 ; AVX2-FP-LABEL: load_i32_stride2_vf8:
327 ; AVX2-FP: # %bb.0:
328 ; AVX2-FP-NEXT: vmovaps (%rdi), %ymm0
329 ; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm1
330 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
331 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
332 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
333 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
334 ; AVX2-FP-NEXT: vmovaps %ymm2, (%rsi)
335 ; AVX2-FP-NEXT: vmovaps %ymm0, (%rdx)
336 ; AVX2-FP-NEXT: vzeroupper
337 ; AVX2-FP-NEXT: retq
338 ;
339 ; AVX2-FCP-LABEL: load_i32_stride2_vf8:
340 ; AVX2-FCP: # %bb.0:
341 ; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm0
342 ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm1
343 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
344 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
345 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
346 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
347 ; AVX2-FCP-NEXT: vmovaps %ymm2, (%rsi)
348 ; AVX2-FCP-NEXT: vmovaps %ymm0, (%rdx)
349 ; AVX2-FCP-NEXT: vzeroupper
350 ; AVX2-FCP-NEXT: retq
351 ;
352 ; AVX512-LABEL: load_i32_stride2_vf8:
353 ; AVX512: # %bb.0:
354 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
355 ; AVX512-NEXT: vmovaps (%rdi), %ymm1
356 ; AVX512-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,3],mem[1,3],ymm1[5,7],mem[5,7]
357 ; AVX512-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,1,3]
358 ; AVX512-NEXT: vpmovqd %zmm0, (%rsi)
359 ; AVX512-NEXT: vmovaps %ymm1, (%rdx)
360 ; AVX512-NEXT: vzeroupper
361 ; AVX512-NEXT: retq
362 ;
363 ; AVX512-FCP-LABEL: load_i32_stride2_vf8:
364 ; AVX512-FCP: # %bb.0:
365 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
366 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm1
367 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15]
368 ; AVX512-FCP-NEXT: vpermi2d 32(%rdi), %ymm1, %ymm2
369 ; AVX512-FCP-NEXT: vpmovqd %zmm0, (%rsi)
370 ; AVX512-FCP-NEXT: vmovdqa %ymm2, (%rdx)
371 ; AVX512-FCP-NEXT: vzeroupper
372 ; AVX512-FCP-NEXT: retq
373 ;
374 ; AVX512DQ-LABEL: load_i32_stride2_vf8:
375 ; AVX512DQ: # %bb.0:
376 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0
377 ; AVX512DQ-NEXT: vmovaps (%rdi), %ymm1
378 ; AVX512DQ-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,3],mem[1,3],ymm1[5,7],mem[5,7]
379 ; AVX512DQ-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,1,3]
380 ; AVX512DQ-NEXT: vpmovqd %zmm0, (%rsi)
381 ; AVX512DQ-NEXT: vmovaps %ymm1, (%rdx)
382 ; AVX512DQ-NEXT: vzeroupper
383 ; AVX512DQ-NEXT: retq
384 ;
385 ; AVX512DQ-FCP-LABEL: load_i32_stride2_vf8:
386 ; AVX512DQ-FCP: # %bb.0:
387 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
388 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm1
389 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15]
390 ; AVX512DQ-FCP-NEXT: vpermi2d 32(%rdi), %ymm1, %ymm2
391 ; AVX512DQ-FCP-NEXT: vpmovqd %zmm0, (%rsi)
392 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, (%rdx)
393 ; AVX512DQ-FCP-NEXT: vzeroupper
394 ; AVX512DQ-FCP-NEXT: retq
395 ;
396 ; AVX512BW-LABEL: load_i32_stride2_vf8:
397 ; AVX512BW: # %bb.0:
398 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
399 ; AVX512BW-NEXT: vmovaps (%rdi), %ymm1
400 ; AVX512BW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,3],mem[1,3],ymm1[5,7],mem[5,7]
401 ; AVX512BW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,1,3]
402 ; AVX512BW-NEXT: vpmovqd %zmm0, (%rsi)
403 ; AVX512BW-NEXT: vmovaps %ymm1, (%rdx)
404 ; AVX512BW-NEXT: vzeroupper
405 ; AVX512BW-NEXT: retq
406 ;
407 ; AVX512BW-FCP-LABEL: load_i32_stride2_vf8:
408 ; AVX512BW-FCP: # %bb.0:
409 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
410 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm1
411 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15]
412 ; AVX512BW-FCP-NEXT: vpermi2d 32(%rdi), %ymm1, %ymm2
413 ; AVX512BW-FCP-NEXT: vpmovqd %zmm0, (%rsi)
414 ; AVX512BW-FCP-NEXT: vmovdqa %ymm2, (%rdx)
415 ; AVX512BW-FCP-NEXT: vzeroupper
416 ; AVX512BW-FCP-NEXT: retq
417 ;
418 ; AVX512DQ-BW-LABEL: load_i32_stride2_vf8:
419 ; AVX512DQ-BW: # %bb.0:
420 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0
421 ; AVX512DQ-BW-NEXT: vmovaps (%rdi), %ymm1
422 ; AVX512DQ-BW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,3],mem[1,3],ymm1[5,7],mem[5,7]
423 ; AVX512DQ-BW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,1,3]
424 ; AVX512DQ-BW-NEXT: vpmovqd %zmm0, (%rsi)
425 ; AVX512DQ-BW-NEXT: vmovaps %ymm1, (%rdx)
426 ; AVX512DQ-BW-NEXT: vzeroupper
427 ; AVX512DQ-BW-NEXT: retq
428 ;
429 ; AVX512DQ-BW-FCP-LABEL: load_i32_stride2_vf8:
430 ; AVX512DQ-BW-FCP: # %bb.0:
431 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
432 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm1
433 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15]
434 ; AVX512DQ-BW-FCP-NEXT: vpermi2d 32(%rdi), %ymm1, %ymm2
435 ; AVX512DQ-BW-FCP-NEXT: vpmovqd %zmm0, (%rsi)
436 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm2, (%rdx)
437 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
438 ; AVX512DQ-BW-FCP-NEXT: retq
439 %wide.vec = load <16 x i32>, ptr %in.vec, align 64
440 %strided.vec0 = shufflevector <16 x i32> %wide.vec, <16 x i32> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
441 %strided.vec1 = shufflevector <16 x i32> %wide.vec, <16 x i32> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
442 store <8 x i32> %strided.vec0, ptr %out.vec0, align 64
443 store <8 x i32> %strided.vec1, ptr %out.vec1, align 64
444 ret void
445 }

447 define void @load_i32_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nounwind {
448 ; SSE-LABEL: load_i32_stride2_vf16:
449 ; SSE: # %bb.0:
450 ; SSE-NEXT: movaps (%rdi), %xmm0
451 ; SSE-NEXT: movaps 16(%rdi), %xmm1
452 ; SSE-NEXT: movaps 32(%rdi), %xmm2
453 ; SSE-NEXT: movaps 48(%rdi), %xmm3
454 ; SSE-NEXT: movaps 80(%rdi), %xmm4
455 ; SSE-NEXT: movaps 64(%rdi), %xmm5
456 ; SSE-NEXT: movaps 112(%rdi), %xmm6
457 ; SSE-NEXT: movaps 96(%rdi), %xmm7
458 ; SSE-NEXT: movaps %xmm7, %xmm8
459 ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm6[0,2]
460 ; SSE-NEXT: movaps %xmm5, %xmm9
461 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm4[0,2]
462 ; SSE-NEXT: movaps %xmm2, %xmm10
463 ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm3[0,2]
464 ; SSE-NEXT: movaps %xmm0, %xmm11
465 ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm1[0,2]
466 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,3],xmm6[1,3]
467 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,3],xmm4[1,3]
468 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3]
469 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
470 ; SSE-NEXT: movaps %xmm9, 32(%rsi)
471 ; SSE-NEXT: movaps %xmm8, 48(%rsi)
472 ; SSE-NEXT: movaps %xmm11, (%rsi)
473 ; SSE-NEXT: movaps %xmm10, 16(%rsi)
474 ; SSE-NEXT: movaps %xmm5, 32(%rdx)
475 ; SSE-NEXT: movaps %xmm7, 48(%rdx)
476 ; SSE-NEXT: movaps %xmm0, (%rdx)
477 ; SSE-NEXT: movaps %xmm2, 16(%rdx)
478 ; SSE-NEXT: retq
479 ;
480 ; AVX-LABEL: load_i32_stride2_vf16:
481 ; AVX: # %bb.0:
482 ; AVX-NEXT: vmovaps (%rdi), %ymm0
483 ; AVX-NEXT: vmovaps 64(%rdi), %ymm1
484 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],mem[2,3]
485 ; AVX-NEXT: vinsertf128 $1, 96(%rdi), %ymm1, %ymm1
486 ; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm1[0,2],ymm2[0,2],ymm1[4,6],ymm2[4,6]
487 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3],mem[2,3]
488 ; AVX-NEXT: vinsertf128 $1, 32(%rdi), %ymm0, %ymm0
489 ; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm0[0,2],ymm4[0,2],ymm0[4,6],ymm4[4,6]
490 ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,3],ymm2[1,3],ymm1[5,7],ymm2[5,7]
491 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm4[1,3],ymm0[5,7],ymm4[5,7]
492 ; AVX-NEXT: vmovaps %ymm5, (%rsi)
493 ; AVX-NEXT: vmovaps %ymm3, 32(%rsi)
494 ; AVX-NEXT: vmovaps %ymm0, (%rdx)
495 ; AVX-NEXT: vmovaps %ymm1, 32(%rdx)
496 ; AVX-NEXT: vzeroupper
497 ; AVX-NEXT: retq
498 ;
499 ; AVX2-LABEL: load_i32_stride2_vf16:
500 ; AVX2: # %bb.0:
501 ; AVX2-NEXT: vmovaps (%rdi), %ymm0
502 ; AVX2-NEXT: vmovaps 32(%rdi), %ymm1
503 ; AVX2-NEXT: vmovaps 64(%rdi), %ymm2
504 ; AVX2-NEXT: vmovaps 96(%rdi), %ymm3
505 ; AVX2-NEXT: vshufps {{.*#+}} ymm4 = ymm2[0,2],ymm3[0,2],ymm2[4,6],ymm3[4,6]
506 ; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,1,3]
507 ; AVX2-NEXT: vshufps {{.*#+}} ymm5 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
508 ; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,1,3]
509 ; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,3],ymm3[1,3],ymm2[5,7],ymm3[5,7]
510 ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
511 ; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
512 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
513 ; AVX2-NEXT: vmovaps %ymm5, (%rsi)
514 ; AVX2-NEXT: vmovaps %ymm4, 32(%rsi)
515 ; AVX2-NEXT: vmovaps %ymm0, (%rdx)
516 ; AVX2-NEXT: vmovaps %ymm2, 32(%rdx)
517 ; AVX2-NEXT: vzeroupper
518 ; AVX2-NEXT: retq
519 ;
520 ; AVX2-FP-LABEL: load_i32_stride2_vf16:
521 ; AVX2-FP: # %bb.0:
522 ; AVX2-FP-NEXT: vmovaps (%rdi), %ymm0
523 ; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm1
524 ; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm2
525 ; AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm3
526 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm4 = ymm2[0,2],ymm3[0,2],ymm2[4,6],ymm3[4,6]
527 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,1,3]
528 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm5 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
529 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,1,3]
530 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,3],ymm3[1,3],ymm2[5,7],ymm3[5,7]
531 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
532 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
533 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
534 ; AVX2-FP-NEXT: vmovaps %ymm5, (%rsi)
535 ; AVX2-FP-NEXT: vmovaps %ymm4, 32(%rsi)
536 ; AVX2-FP-NEXT: vmovaps %ymm0, (%rdx)
537 ; AVX2-FP-NEXT: vmovaps %ymm2, 32(%rdx)
538 ; AVX2-FP-NEXT: vzeroupper
539 ; AVX2-FP-NEXT: retq
540 ;
541 ; AVX2-FCP-LABEL: load_i32_stride2_vf16:
542 ; AVX2-FCP: # %bb.0:
543 ; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm0
544 ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm1
545 ; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm2
546 ; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm3
547 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm4 = ymm2[0,2],ymm3[0,2],ymm2[4,6],ymm3[4,6]
548 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,1,3]
549 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm5 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
550 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,1,3]
551 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,3],ymm3[1,3],ymm2[5,7],ymm3[5,7]
552 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
553 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
554 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
555 ; AVX2-FCP-NEXT: vmovaps %ymm5, (%rsi)
556 ; AVX2-FCP-NEXT: vmovaps %ymm4, 32(%rsi)
557 ; AVX2-FCP-NEXT: vmovaps %ymm0, (%rdx)
558 ; AVX2-FCP-NEXT: vmovaps %ymm2, 32(%rdx)
559 ; AVX2-FCP-NEXT: vzeroupper
560 ; AVX2-FCP-NEXT: retq
561 ;
562 ; AVX512-LABEL: load_i32_stride2_vf16:
563 ; AVX512: # %bb.0:
564 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
565 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1
566 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
567 ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
568 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
569 ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
570 ; AVX512-NEXT: vmovdqa64 %zmm2, (%rsi)
571 ; AVX512-NEXT: vmovdqa64 %zmm3, (%rdx)
572 ; AVX512-NEXT: vzeroupper
573 ; AVX512-NEXT: retq
574 ;
575 ; AVX512-FCP-LABEL: load_i32_stride2_vf16:
576 ; AVX512-FCP: # %bb.0:
577 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
578 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
579 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
580 ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
581 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
582 ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
583 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, (%rsi)
584 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, (%rdx)
585 ; AVX512-FCP-NEXT: vzeroupper
586 ; AVX512-FCP-NEXT: retq
587 ;
588 ; AVX512DQ-LABEL: load_i32_stride2_vf16:
589 ; AVX512DQ: # %bb.0:
590 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0
591 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm1
592 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
593 ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
594 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
595 ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
596 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, (%rsi)
597 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, (%rdx)
598 ; AVX512DQ-NEXT: vzeroupper
599 ; AVX512DQ-NEXT: retq
600 ;
601 ; AVX512DQ-FCP-LABEL: load_i32_stride2_vf16:
602 ; AVX512DQ-FCP: # %bb.0:
603 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
604 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
605 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
606 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
607 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
608 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
609 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%rsi)
610 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, (%rdx)
611 ; AVX512DQ-FCP-NEXT: vzeroupper
612 ; AVX512DQ-FCP-NEXT: retq
613 ;
614 ; AVX512BW-LABEL: load_i32_stride2_vf16:
615 ; AVX512BW: # %bb.0:
616 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
617 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1
618 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
619 ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
620 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
621 ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
622 ; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rsi)
623 ; AVX512BW-NEXT: vmovdqa64 %zmm3, (%rdx)
624 ; AVX512BW-NEXT: vzeroupper
625 ; AVX512BW-NEXT: retq
626 ;
627 ; AVX512BW-FCP-LABEL: load_i32_stride2_vf16:
628 ; AVX512BW-FCP: # %bb.0:
629 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
630 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
631 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
632 ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
633 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
634 ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
635 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%rsi)
636 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%rdx)
637 ; AVX512BW-FCP-NEXT: vzeroupper
638 ; AVX512BW-FCP-NEXT: retq
639 ;
640 ; AVX512DQ-BW-LABEL: load_i32_stride2_vf16:
641 ; AVX512DQ-BW: # %bb.0:
642 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0
643 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1
644 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
645 ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
646 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
647 ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
648 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%rsi)
649 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, (%rdx)
650 ; AVX512DQ-BW-NEXT: vzeroupper
651 ; AVX512DQ-BW-NEXT: retq
652 ;
653 ; AVX512DQ-BW-FCP-LABEL: load_i32_stride2_vf16:
654 ; AVX512DQ-BW-FCP: # %bb.0:
655 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
656 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
657 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
658 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
659 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
660 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
661 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%rsi)
662 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, (%rdx)
663 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
664 ; AVX512DQ-BW-FCP-NEXT: retq
665 %wide.vec = load <32 x i32>, ptr %in.vec, align 64
666 %strided.vec0 = shufflevector <32 x i32> %wide.vec, <32 x i32> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
667 %strided.vec1 = shufflevector <32 x i32> %wide.vec, <32 x i32> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
668 store <16 x i32> %strided.vec0, ptr %out.vec0, align 64
669 store <16 x i32> %strided.vec1, ptr %out.vec1, align 64
670 ret void
671 }

673 define void @load_i32_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nounwind {
674 ; SSE-LABEL: load_i32_stride2_vf32:
675 ; SSE: # %bb.0:
676 ; SSE-NEXT: movaps (%rdi), %xmm1
677 ; SSE-NEXT: movaps 16(%rdi), %xmm8
678 ; SSE-NEXT: movaps 32(%rdi), %xmm0
679 ; SSE-NEXT: movaps 208(%rdi), %xmm11
680 ; SSE-NEXT: movaps 192(%rdi), %xmm2
681 ; SSE-NEXT: movaps 240(%rdi), %xmm10
682 ; SSE-NEXT: movaps 224(%rdi), %xmm4
683 ; SSE-NEXT: movaps 144(%rdi), %xmm14
684 ; SSE-NEXT: movaps 128(%rdi), %xmm3
685 ; SSE-NEXT: movaps 176(%rdi), %xmm12
686 ; SSE-NEXT: movaps 160(%rdi), %xmm6
687 ; SSE-NEXT: movaps 80(%rdi), %xmm13
688 ; SSE-NEXT: movaps 64(%rdi), %xmm5
689 ; SSE-NEXT: movaps 112(%rdi), %xmm15
690 ; SSE-NEXT: movaps 96(%rdi), %xmm7
691 ; SSE-NEXT: movaps %xmm5, %xmm9
692 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm13[0,2]
693 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,3],xmm13[1,3]
694 ; SSE-NEXT: movaps %xmm7, %xmm13
695 ; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,2],xmm15[0,2]
696 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,3],xmm15[1,3]
697 ; SSE-NEXT: movaps %xmm3, %xmm15
698 ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,2],xmm14[0,2]
699 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,3],xmm14[1,3]
700 ; SSE-NEXT: movaps %xmm6, %xmm14
701 ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2],xmm12[0,2]
702 ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,3],xmm12[1,3]
703 ; SSE-NEXT: movaps %xmm2, %xmm12
704 ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,2],xmm11[0,2]
705 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm11[1,3]
706 ; SSE-NEXT: movaps %xmm4, %xmm11
707 ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm10[0,2]
708 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,3],xmm10[1,3]
709 ; SSE-NEXT: movaps %xmm1, %xmm10
710 ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm8[0,2]
711 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm8[1,3]
712 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
713 ; SSE-NEXT: movaps 48(%rdi), %xmm8
714 ; SSE-NEXT: movaps %xmm0, %xmm1
715 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm8[0,2]
716 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm8[1,3]
717 ; SSE-NEXT: movaps %xmm12, 96(%rsi)
718 ; SSE-NEXT: movaps %xmm11, 112(%rsi)
719 ; SSE-NEXT: movaps %xmm15, 64(%rsi)
720 ; SSE-NEXT: movaps %xmm14, 80(%rsi)
721 ; SSE-NEXT: movaps %xmm9, 32(%rsi)
722 ; SSE-NEXT: movaps %xmm13, 48(%rsi)
723 ; SSE-NEXT: movaps %xmm10, (%rsi)
724 ; SSE-NEXT: movaps %xmm1, 16(%rsi)
725 ; SSE-NEXT: movaps %xmm4, 112(%rdx)
726 ; SSE-NEXT: movaps %xmm2, 96(%rdx)
727 ; SSE-NEXT: movaps %xmm6, 80(%rdx)
728 ; SSE-NEXT: movaps %xmm3, 64(%rdx)
729 ; SSE-NEXT: movaps %xmm7, 48(%rdx)
730 ; SSE-NEXT: movaps %xmm5, 32(%rdx)
731 ; SSE-NEXT: movaps %xmm0, 16(%rdx)
732 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
733 ; SSE-NEXT: movaps %xmm0, (%rdx)
734 ; SSE-NEXT: retq
735 ;
736 ; AVX-LABEL: load_i32_stride2_vf32:
737 ; AVX: # %bb.0:
738 ; AVX-NEXT: vmovaps (%rdi), %ymm0
739 ; AVX-NEXT: vmovaps 64(%rdi), %ymm1
740 ; AVX-NEXT: vmovaps 128(%rdi), %ymm2
741 ; AVX-NEXT: vmovaps 192(%rdi), %ymm3
742 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm3[2,3],mem[2,3]
743 ; AVX-NEXT: vinsertf128 $1, 224(%rdi), %ymm3, %ymm3
744 ; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm3[0,2],ymm4[0,2],ymm3[4,6],ymm4[4,6]
745 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm1[2,3],mem[2,3]
746 ; AVX-NEXT: vinsertf128 $1, 96(%rdi), %ymm1, %ymm1
747 ; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm1[0,2],ymm6[0,2],ymm1[4,6],ymm6[4,6]
748 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm0[2,3],mem[2,3]
749 ; AVX-NEXT: vinsertf128 $1, 32(%rdi), %ymm0, %ymm0
750 ; AVX-NEXT: vshufps {{.*#+}} ymm9 = ymm0[0,2],ymm8[0,2],ymm0[4,6],ymm8[4,6]
751 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm2[2,3],mem[2,3]
752 ; AVX-NEXT: vinsertf128 $1, 160(%rdi), %ymm2, %ymm2
753 ; AVX-NEXT: vshufps {{.*#+}} ymm11 = ymm2[0,2],ymm10[0,2],ymm2[4,6],ymm10[4,6]
754 ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,3],ymm6[1,3],ymm1[5,7],ymm6[5,7]
755 ; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,3],ymm4[1,3],ymm3[5,7],ymm4[5,7]
756 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm8[1,3],ymm0[5,7],ymm8[5,7]
757 ; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,3],ymm10[1,3],ymm2[5,7],ymm10[5,7]
758 ; AVX-NEXT: vmovaps %ymm11, 64(%rsi)
759 ; AVX-NEXT: vmovaps %ymm9, (%rsi)
760 ; AVX-NEXT: vmovaps %ymm7, 32(%rsi)
761 ; AVX-NEXT: vmovaps %ymm5, 96(%rsi)
762 ; AVX-NEXT: vmovaps %ymm2, 64(%rdx)
763 ; AVX-NEXT: vmovaps %ymm0, (%rdx)
764 ; AVX-NEXT: vmovaps %ymm3, 96(%rdx)
765 ; AVX-NEXT: vmovaps %ymm1, 32(%rdx)
766 ; AVX-NEXT: vzeroupper
767 ; AVX-NEXT: retq
768 ;
769 ; AVX2-LABEL: load_i32_stride2_vf32:
770 ; AVX2: # %bb.0:
771 ; AVX2-NEXT: vmovaps (%rdi), %ymm0
772 ; AVX2-NEXT: vmovaps 32(%rdi), %ymm1
773 ; AVX2-NEXT: vmovaps 64(%rdi), %ymm2
774 ; AVX2-NEXT: vmovaps 96(%rdi), %ymm3
775 ; AVX2-NEXT: vmovaps 160(%rdi), %ymm4
776 ; AVX2-NEXT: vmovaps 128(%rdi), %ymm5
777 ; AVX2-NEXT: vmovaps 224(%rdi), %ymm6
778 ; AVX2-NEXT: vmovaps 192(%rdi), %ymm7
779 ; AVX2-NEXT: vshufps {{.*#+}} ymm8 = ymm7[0,2],ymm6[0,2],ymm7[4,6],ymm6[4,6]
780 ; AVX2-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,2,1,3]
781 ; AVX2-NEXT: vshufps {{.*#+}} ymm9 = ymm5[0,2],ymm4[0,2],ymm5[4,6],ymm4[4,6]
782 ; AVX2-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,2,1,3]
783 ; AVX2-NEXT: vshufps {{.*#+}} ymm10 = ymm2[0,2],ymm3[0,2],ymm2[4,6],ymm3[4,6]
784 ; AVX2-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,2,1,3]
785 ; AVX2-NEXT: vshufps {{.*#+}} ymm11 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
786 ; AVX2-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,2,1,3]
787 ; AVX2-NEXT: vshufps {{.*#+}} ymm6 = ymm7[1,3],ymm6[1,3],ymm7[5,7],ymm6[5,7]
788 ; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,1,3]
789 ; AVX2-NEXT: vshufps {{.*#+}} ymm4 = ymm5[1,3],ymm4[1,3],ymm5[5,7],ymm4[5,7]
790 ; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,1,3]
791 ; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,3],ymm3[1,3],ymm2[5,7],ymm3[5,7]
792 ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
793 ; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
794 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
795 ; AVX2-NEXT: vmovaps %ymm9, 64(%rsi)
796 ; AVX2-NEXT: vmovaps %ymm11, (%rsi)
797 ; AVX2-NEXT: vmovaps %ymm8, 96(%rsi)
798 ; AVX2-NEXT: vmovaps %ymm10, 32(%rsi)
799 ; AVX2-NEXT: vmovaps %ymm4, 64(%rdx)
800 ; AVX2-NEXT: vmovaps %ymm0, (%rdx)
801 ; AVX2-NEXT: vmovaps %ymm6, 96(%rdx)
802 ; AVX2-NEXT: vmovaps %ymm2, 32(%rdx)
803 ; AVX2-NEXT: vzeroupper
804 ; AVX2-NEXT: retq
805 ;
806 ; AVX2-FP-LABEL: load_i32_stride2_vf32:
807 ; AVX2-FP: # %bb.0:
808 ; AVX2-FP-NEXT: vmovaps (%rdi), %ymm0
809 ; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm1
810 ; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm2
811 ; AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm3
812 ; AVX2-FP-NEXT: vmovaps 160(%rdi), %ymm4
813 ; AVX2-FP-NEXT: vmovaps 128(%rdi), %ymm5
814 ; AVX2-FP-NEXT: vmovaps 224(%rdi), %ymm6
815 ; AVX2-FP-NEXT: vmovaps 192(%rdi), %ymm7
816 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm8 = ymm7[0,2],ymm6[0,2],ymm7[4,6],ymm6[4,6]
817 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,2,1,3]
818 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm9 = ymm5[0,2],ymm4[0,2],ymm5[4,6],ymm4[4,6]
819 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,2,1,3]
820 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm10 = ymm2[0,2],ymm3[0,2],ymm2[4,6],ymm3[4,6]
821 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,2,1,3]
822 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm11 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
823 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,2,1,3]
824 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm6 = ymm7[1,3],ymm6[1,3],ymm7[5,7],ymm6[5,7]
825 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,1,3]
826 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm4 = ymm5[1,3],ymm4[1,3],ymm5[5,7],ymm4[5,7]
827 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,1,3]
828 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,3],ymm3[1,3],ymm2[5,7],ymm3[5,7]
829 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
830 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
831 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
832 ; AVX2-FP-NEXT: vmovaps %ymm9, 64(%rsi)
833 ; AVX2-FP-NEXT: vmovaps %ymm11, (%rsi)
834 ; AVX2-FP-NEXT: vmovaps %ymm8, 96(%rsi)
835 ; AVX2-FP-NEXT: vmovaps %ymm10, 32(%rsi)
836 ; AVX2-FP-NEXT: vmovaps %ymm4, 64(%rdx)
837 ; AVX2-FP-NEXT: vmovaps %ymm0, (%rdx)
838 ; AVX2-FP-NEXT: vmovaps %ymm6, 96(%rdx)
839 ; AVX2-FP-NEXT: vmovaps %ymm2, 32(%rdx)
840 ; AVX2-FP-NEXT: vzeroupper
841 ; AVX2-FP-NEXT: retq
842 ;
843 ; AVX2-FCP-LABEL: load_i32_stride2_vf32:
844 ; AVX2-FCP: # %bb.0:
845 ; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm0
846 ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm1
847 ; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm2
848 ; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm3
849 ; AVX2-FCP-NEXT: vmovaps 160(%rdi), %ymm4
850 ; AVX2-FCP-NEXT: vmovaps 128(%rdi), %ymm5
851 ; AVX2-FCP-NEXT: vmovaps 224(%rdi), %ymm6
852 ; AVX2-FCP-NEXT: vmovaps 192(%rdi), %ymm7
853 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm8 = ymm7[0,2],ymm6[0,2],ymm7[4,6],ymm6[4,6]
854 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,2,1,3]
855 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm9 = ymm5[0,2],ymm4[0,2],ymm5[4,6],ymm4[4,6]
856 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,2,1,3]
857 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm10 = ymm2[0,2],ymm3[0,2],ymm2[4,6],ymm3[4,6]
858 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,2,1,3]
859 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm11 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
860 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,2,1,3]
861 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm6 = ymm7[1,3],ymm6[1,3],ymm7[5,7],ymm6[5,7]
862 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,1,3]
863 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm4 = ymm5[1,3],ymm4[1,3],ymm5[5,7],ymm4[5,7]
864 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,1,3]
865 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,3],ymm3[1,3],ymm2[5,7],ymm3[5,7]
866 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
867 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
868 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
869 ; AVX2-FCP-NEXT: vmovaps %ymm9, 64(%rsi)
870 ; AVX2-FCP-NEXT: vmovaps %ymm11, (%rsi)
871 ; AVX2-FCP-NEXT: vmovaps %ymm8, 96(%rsi)
872 ; AVX2-FCP-NEXT: vmovaps %ymm10, 32(%rsi)
873 ; AVX2-FCP-NEXT: vmovaps %ymm4, 64(%rdx)
874 ; AVX2-FCP-NEXT: vmovaps %ymm0, (%rdx)
875 ; AVX2-FCP-NEXT: vmovaps %ymm6, 96(%rdx)
876 ; AVX2-FCP-NEXT: vmovaps %ymm2, 32(%rdx)
877 ; AVX2-FCP-NEXT: vzeroupper
878 ; AVX2-FCP-NEXT: retq
879 ;
880 ; AVX512-LABEL: load_i32_stride2_vf32:
881 ; AVX512: # %bb.0:
882 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
883 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1
884 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm2
885 ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm3
886 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
887 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5
888 ; AVX512-NEXT: vpermt2d %zmm1, %zmm4, %zmm5
889 ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm4
890 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
891 ; AVX512-NEXT: vpermt2d %zmm1, %zmm6, %zmm0
892 ; AVX512-NEXT: vpermt2d %zmm3, %zmm6, %zmm2
893 ; AVX512-NEXT: vmovdqa64 %zmm4, 64(%rsi)
894 ; AVX512-NEXT: vmovdqa64 %zmm5, (%rsi)
895 ; AVX512-NEXT: vmovdqa64 %zmm2, 64(%rdx)
896 ; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx)
897 ; AVX512-NEXT: vzeroupper
898 ; AVX512-NEXT: retq
899 ;
900 ; AVX512-FCP-LABEL: load_i32_stride2_vf32:
901 ; AVX512-FCP: # %bb.0:
902 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
903 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
904 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2
905 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3
906 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
907 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm5
908 ; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm4, %zmm5
909 ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4
910 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
911 ; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm6, %zmm0
912 ; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm6, %zmm2
913 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 64(%rsi)
914 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%rsi)
915 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 64(%rdx)
916 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%rdx)
917 ; AVX512-FCP-NEXT: vzeroupper
918 ; AVX512-FCP-NEXT: retq
919 ;
920 ; AVX512DQ-LABEL: load_i32_stride2_vf32:
921 ; AVX512DQ: # %bb.0:
922 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0
923 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm1
924 ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm2
925 ; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm3
926 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
927 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5
928 ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm4, %zmm5
929 ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm4
930 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
931 ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm6, %zmm0
932 ; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm6, %zmm2
933 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, 64(%rsi)
934 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, (%rsi)
935 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, 64(%rdx)
936 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx)
937 ; AVX512DQ-NEXT: vzeroupper
938 ; AVX512DQ-NEXT: retq
939 ;
940 ; AVX512DQ-FCP-LABEL: load_i32_stride2_vf32:
941 ; AVX512DQ-FCP: # %bb.0:
942 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
943 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
944 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2
945 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3
946 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
947 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm5
948 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm4, %zmm5
949 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4
950 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
951 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm6, %zmm0
952 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm6, %zmm2
953 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 64(%rsi)
954 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, (%rsi)
955 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 64(%rdx)
956 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%rdx)
957 ; AVX512DQ-FCP-NEXT: vzeroupper
958 ; AVX512DQ-FCP-NEXT: retq
959 ;
960 ; AVX512BW-LABEL: load_i32_stride2_vf32:
961 ; AVX512BW: # %bb.0:
962 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
963 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1
964 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm2
965 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm3
966 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
967 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5
968 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm4, %zmm5
969 ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm4
970 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
971 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm6, %zmm0
972 ; AVX512BW-NEXT: vpermt2d %zmm3, %zmm6, %zmm2
973 ; AVX512BW-NEXT: vmovdqa64 %zmm4, 64(%rsi)
974 ; AVX512BW-NEXT: vmovdqa64 %zmm5, (%rsi)
975 ; AVX512BW-NEXT: vmovdqa64 %zmm2, 64(%rdx)
976 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
977 ; AVX512BW-NEXT: vzeroupper
978 ; AVX512BW-NEXT: retq
979 ;
980 ; AVX512BW-FCP-LABEL: load_i32_stride2_vf32:
981 ; AVX512BW-FCP: # %bb.0:
982 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
983 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
984 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2
985 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3
986 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
987 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5
988 ; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm4, %zmm5
989 ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4
990 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
991 ; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm6, %zmm0
992 ; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm6, %zmm2
993 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 64(%rsi)
994 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, (%rsi)
995 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 64(%rdx)
996 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%rdx)
997 ; AVX512BW-FCP-NEXT: vzeroupper
998 ; AVX512BW-FCP-NEXT: retq
999 ;
1000 ; AVX512DQ-BW-LABEL: load_i32_stride2_vf32:
1001 ; AVX512DQ-BW: # %bb.0:
1002 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0
1003 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1
1004 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm2
1005 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm3
1006 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
1007 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5
1008 ; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm4, %zmm5
1009 ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm4
1010 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
1011 ; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm6, %zmm0
1012 ; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm6, %zmm2
1013 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 64(%rsi)
1014 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, (%rsi)
1015 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 64(%rdx)
1016 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%rdx)
1017 ; AVX512DQ-BW-NEXT: vzeroupper
1018 ; AVX512DQ-BW-NEXT: retq
1019 ;
1020 ; AVX512DQ-BW-FCP-LABEL: load_i32_stride2_vf32:
1021 ; AVX512DQ-BW-FCP: # %bb.0:
1022 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
1023 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
1024 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2
1025 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3
1026 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
1027 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5
1028 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm4, %zmm5
1029 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4
1030 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
1031 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm6, %zmm0
1032 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm6, %zmm2
1033 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 64(%rsi)
1034 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, (%rsi)
1035 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 64(%rdx)
1036 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%rdx)
1037 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
1038 ; AVX512DQ-BW-FCP-NEXT: retq
1039 %wide.vec = load <64 x i32>, ptr %in.vec, align 64
1040 %strided.vec0 = shufflevector <64 x i32> %wide.vec, <64 x i32> poison, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62>
1041 %strided.vec1 = shufflevector <64 x i32> %wide.vec, <64 x i32> poison, <32 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 33, i32 35, i32 37, i32 39, i32 41, i32 43, i32 45, i32 47, i32 49, i32 51, i32 53, i32 55, i32 57, i32 59, i32 61, i32 63>
1042 store <32 x i32> %strided.vec0, ptr %out.vec0, align 64
1043 store <32 x i32> %strided.vec1, ptr %out.vec1, align 64
1044 ret void
1045 }

1047 define void @load_i32_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nounwind {
1048 ; SSE-LABEL: load_i32_stride2_vf64:
1049 ; SSE: # %bb.0:
1050 ; SSE-NEXT: subq $152, %rsp
1051 ; SSE-NEXT: movaps 208(%rdi), %xmm11
1052 ; SSE-NEXT: movaps 192(%rdi), %xmm6
1053 ; SSE-NEXT: movaps 80(%rdi), %xmm1
1054 ; SSE-NEXT: movaps 64(%rdi), %xmm5
1055 ; SSE-NEXT: movaps 240(%rdi), %xmm14
1056 ; SSE-NEXT: movaps 224(%rdi), %xmm8
1057 ; SSE-NEXT: movaps 112(%rdi), %xmm3
1058 ; SSE-NEXT: movaps 96(%rdi), %xmm7
1059 ; SSE-NEXT: movaps 272(%rdi), %xmm12
1060 ; SSE-NEXT: movaps 144(%rdi), %xmm2
1061 ; SSE-NEXT: movaps 128(%rdi), %xmm9
1062 ; SSE-NEXT: movaps 304(%rdi), %xmm0
1063 ; SSE-NEXT: movaps 288(%rdi), %xmm13
1064 ; SSE-NEXT: movaps 176(%rdi), %xmm4
1065 ; SSE-NEXT: movaps 160(%rdi), %xmm10
1066 ; SSE-NEXT: movaps %xmm7, %xmm15
1067 ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,2],xmm3[0,2]
1068 ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1069 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,3],xmm3[1,3]
1070 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1071 ; SSE-NEXT: movaps %xmm5, %xmm3
1072 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm1[0,2]
1073 ; SSE-NEXT: movaps %xmm3, (%rsp) # 16-byte Spill
1074 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,3],xmm1[1,3]
1075 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1076 ; SSE-NEXT: movaps %xmm10, %xmm1
1077 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm4[0,2]
1078 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1079 ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,3],xmm4[1,3]
1080 ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1081 ; SSE-NEXT: movaps %xmm9, %xmm1
1082 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
1083 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1084 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,3],xmm2[1,3]
1085 ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1086 ; SSE-NEXT: movaps %xmm8, %xmm1
1087 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm14[0,2]
1088 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1089 ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,3],xmm14[1,3]
1090 ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1091 ; SSE-NEXT: movaps %xmm6, %xmm1
1092 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm11[0,2]
1093 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1094 ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,3],xmm11[1,3]
1095 ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1096 ; SSE-NEXT: movaps %xmm13, %xmm1
1097 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2]
1098 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1099 ; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,3],xmm0[1,3]
1100 ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1101 ; SSE-NEXT: movaps 256(%rdi), %xmm0
1102 ; SSE-NEXT: movaps %xmm0, %xmm1
1103 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm12[0,2]
1104 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1105 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm12[1,3]
1106 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1107 ; SSE-NEXT: movaps 368(%rdi), %xmm0
1108 ; SSE-NEXT: movaps 352(%rdi), %xmm15
1109 ; SSE-NEXT: movaps %xmm15, %xmm1
1110 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2]
1111 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1112 ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,3],xmm0[1,3]
1113 ; SSE-NEXT: movaps 336(%rdi), %xmm0
1114 ; SSE-NEXT: movaps 320(%rdi), %xmm13
1115 ; SSE-NEXT: movaps %xmm13, %xmm11
1116 ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm0[0,2]
1117 ; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,3],xmm0[1,3]
1118 ; SSE-NEXT: movaps 432(%rdi), %xmm0
1119 ; SSE-NEXT: movaps 416(%rdi), %xmm12
1120 ; SSE-NEXT: movaps %xmm12, %xmm14
1121 ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2],xmm0[0,2]
1122 ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,3],xmm0[1,3]
1123 ; SSE-NEXT: movaps 400(%rdi), %xmm0
1124 ; SSE-NEXT: movaps 384(%rdi), %xmm9
1125 ; SSE-NEXT: movaps %xmm9, %xmm10
1126 ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm0[0,2]
1127 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,3],xmm0[1,3]
1128 ; SSE-NEXT: movaps 496(%rdi), %xmm0
1129 ; SSE-NEXT: movaps 480(%rdi), %xmm7
1130 ; SSE-NEXT: movaps %xmm7, %xmm6
1131 ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm0[0,2]
1132 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,3],xmm0[1,3]
1133 ; SSE-NEXT: movaps 464(%rdi), %xmm1
1134 ; SSE-NEXT: movaps 448(%rdi), %xmm3
1135 ; SSE-NEXT: movaps %xmm3, %xmm2
1136 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2]
1137 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,3],xmm1[1,3]
1138 ; SSE-NEXT: movaps 32(%rdi), %xmm8
1139 ; SSE-NEXT: movaps 48(%rdi), %xmm1
1140 ; SSE-NEXT: movaps %xmm8, %xmm5
1141 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm1[0,2]
1142 ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,3],xmm1[1,3]
1143 ; SSE-NEXT: movaps (%rdi), %xmm4
1144 ; SSE-NEXT: movaps 16(%rdi), %xmm0
1145 ; SSE-NEXT: movaps %xmm4, %xmm1
1146 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2]
1147 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,3],xmm0[1,3]
1148 ; SSE-NEXT: movaps %xmm2, 224(%rsi)
1149 ; SSE-NEXT: movaps %xmm11, 160(%rsi)
1150 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1151 ; SSE-NEXT: movaps %xmm0, 96(%rsi)
1152 ; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
1153 ; SSE-NEXT: movaps %xmm0, 32(%rsi)
1154 ; SSE-NEXT: movaps %xmm6, 240(%rsi)
1155 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1156 ; SSE-NEXT: movaps %xmm0, 176(%rsi)
1157 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1158 ; SSE-NEXT: movaps %xmm0, 112(%rsi)
1159 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1160 ; SSE-NEXT: movaps %xmm0, 48(%rsi)
1161 ; SSE-NEXT: movaps %xmm10, 192(%rsi)
1162 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1163 ; SSE-NEXT: movaps %xmm0, 128(%rsi)
1164 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1165 ; SSE-NEXT: movaps %xmm0, 64(%rsi)
1166 ; SSE-NEXT: movaps %xmm1, (%rsi)
1167 ; SSE-NEXT: movaps %xmm14, 208(%rsi)
1168 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1169 ; SSE-NEXT: movaps %xmm0, 144(%rsi)
1170 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1171 ; SSE-NEXT: movaps %xmm0, 80(%rsi)
1172 ; SSE-NEXT: movaps %xmm5, 16(%rsi)
1173 ; SSE-NEXT: movaps %xmm3, 224(%rdx)
1174 ; SSE-NEXT: movaps %xmm7, 240(%rdx)
1175 ; SSE-NEXT: movaps %xmm9, 192(%rdx)
1176 ; SSE-NEXT: movaps %xmm12, 208(%rdx)
1177 ; SSE-NEXT: movaps %xmm13, 160(%rdx)
1178 ; SSE-NEXT: movaps %xmm15, 176(%rdx)
1179 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1180 ; SSE-NEXT: movaps %xmm0, 128(%rdx)
1181 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1182 ; SSE-NEXT: movaps %xmm0, 144(%rdx)
1183 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1184 ; SSE-NEXT: movaps %xmm0, 96(%rdx)
1185 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1186 ; SSE-NEXT: movaps %xmm0, 112(%rdx)
1187 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1188 ; SSE-NEXT: movaps %xmm0, 64(%rdx)
1189 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1190 ; SSE-NEXT: movaps %xmm0, 80(%rdx)
1191 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1192 ; SSE-NEXT: movaps %xmm0, 32(%rdx)
1193 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1194 ; SSE-NEXT: movaps %xmm0, 48(%rdx)
1195 ; SSE-NEXT: movaps %xmm4, (%rdx)
1196 ; SSE-NEXT: movaps %xmm8, 16(%rdx)
1197 ; SSE-NEXT: addq $152, %rsp
1198 ; SSE-NEXT: retq
1199 ;
1200 ; AVX-LABEL: load_i32_stride2_vf64:
1201 ; AVX: # %bb.0:
1202 ; AVX-NEXT: vmovaps 384(%rdi), %ymm4
1203 ; AVX-NEXT: vmovaps 256(%rdi), %ymm6
1204 ; AVX-NEXT: vmovaps 320(%rdi), %ymm5
1205 ; AVX-NEXT: vmovaps (%rdi), %ymm1
1206 ; AVX-NEXT: vmovaps 64(%rdi), %ymm2
1207 ; AVX-NEXT: vmovaps 128(%rdi), %ymm9
1208 ; AVX-NEXT: vmovaps 192(%rdi), %ymm3
1209 ; AVX-NEXT: vmovaps 448(%rdi), %ymm0
1210 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm0[2,3],mem[2,3]
1211 ; AVX-NEXT: vinsertf128 $1, 480(%rdi), %ymm0, %ymm10
1212 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm10[0,2],ymm8[0,2],ymm10[4,6],ymm8[4,6]
1213 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1214 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm2[2,3],mem[2,3]
1215 ; AVX-NEXT: vinsertf128 $1, 96(%rdi), %ymm2, %ymm11
1216 ; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm11[0,2],ymm7[0,2],ymm11[4,6],ymm7[4,6]
1217 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm3[2,3],mem[2,3]
1218 ; AVX-NEXT: vinsertf128 $1, 224(%rdi), %ymm3, %ymm13
1219 ; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm13[0,2],ymm12[0,2],ymm13[4,6],ymm12[4,6]
1220 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm5[2,3],mem[2,3]
1221 ; AVX-NEXT: vinsertf128 $1, 352(%rdi), %ymm5, %ymm15
1222 ; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm15[0,2],ymm14[0,2],ymm15[4,6],ymm14[4,6]
1223 ; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm11[1,3],ymm7[1,3],ymm11[5,7],ymm7[5,7]
1224 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm9[2,3],mem[2,3]
1225 ; AVX-NEXT: vinsertf128 $1, 160(%rdi), %ymm9, %ymm9
1226 ; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm13[1,3],ymm12[1,3],ymm13[5,7],ymm12[5,7]
1227 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm6[2,3],mem[2,3]
1228 ; AVX-NEXT: vinsertf128 $1, 288(%rdi), %ymm6, %ymm6
1229 ; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm15[1,3],ymm14[1,3],ymm15[5,7],ymm14[5,7]
1230 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm4[2,3],mem[2,3]
1231 ; AVX-NEXT: vinsertf128 $1, 416(%rdi), %ymm4, %ymm4
1232 ; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm10[1,3],ymm8[1,3],ymm10[5,7],ymm8[5,7]
1233 ; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm4[0,2],ymm15[0,2],ymm4[4,6],ymm15[4,6]
1234 ; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm4[1,3],ymm15[1,3],ymm4[5,7],ymm15[5,7]
1235 ; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm6[0,2],ymm13[0,2],ymm6[4,6],ymm13[4,6]
1236 ; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,3],ymm13[1,3],ymm6[5,7],ymm13[5,7]
1237 ; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm9[0,2],ymm11[0,2],ymm9[4,6],ymm11[4,6]
1238 ; AVX-NEXT: vshufps {{.*#+}} ymm9 = ymm9[1,3],ymm11[1,3],ymm9[5,7],ymm11[5,7]
1239 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm1[2,3],mem[2,3]
1240 ; AVX-NEXT: vinsertf128 $1, 32(%rdi), %ymm1, %ymm1
1241 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,2],ymm11[0,2],ymm1[4,6],ymm11[4,6]
1242 ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,3],ymm11[1,3],ymm1[5,7],ymm11[5,7]
1243 ; AVX-NEXT: vmovaps %ymm10, 192(%rsi)
1244 ; AVX-NEXT: vmovaps %ymm15, 128(%rsi)
1245 ; AVX-NEXT: vmovaps %ymm13, 64(%rsi)
1246 ; AVX-NEXT: vmovaps %ymm0, (%rsi)
1247 ; AVX-NEXT: vmovaps %ymm5, 160(%rsi)
1248 ; AVX-NEXT: vmovaps %ymm3, 96(%rsi)
1249 ; AVX-NEXT: vmovaps %ymm2, 32(%rsi)
1250 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1251 ; AVX-NEXT: vmovaps %ymm0, 224(%rsi)
1252 ; AVX-NEXT: vmovaps %ymm1, (%rdx)
1253 ; AVX-NEXT: vmovaps %ymm9, 64(%rdx)
1254 ; AVX-NEXT: vmovaps %ymm6, 128(%rdx)
1255 ; AVX-NEXT: vmovaps %ymm4, 192(%rdx)
1256 ; AVX-NEXT: vmovaps %ymm8, 224(%rdx)
1257 ; AVX-NEXT: vmovaps %ymm14, 160(%rdx)
1258 ; AVX-NEXT: vmovaps %ymm12, 96(%rdx)
1259 ; AVX-NEXT: vmovaps %ymm7, 32(%rdx)
1260 ; AVX-NEXT: vzeroupper
1261 ; AVX-NEXT: retq
1262 ;
1263 ; AVX2-LABEL: load_i32_stride2_vf64:
1264 ; AVX2: # %bb.0:
1265 ; AVX2-NEXT: vmovaps (%rdi), %ymm1
1266 ; AVX2-NEXT: vmovaps 64(%rdi), %ymm3
1267 ; AVX2-NEXT: vmovaps 96(%rdi), %ymm4
1268 ; AVX2-NEXT: vmovaps 416(%rdi), %ymm10
1269 ; AVX2-NEXT: vmovaps 384(%rdi), %ymm12
1270 ; AVX2-NEXT: vmovaps 288(%rdi), %ymm8
1271 ; AVX2-NEXT: vmovaps 256(%rdi), %ymm11
1272 ; AVX2-NEXT: vmovaps 160(%rdi), %ymm7
1273 ; AVX2-NEXT: vmovaps 128(%rdi), %ymm9
1274 ; AVX2-NEXT: vmovaps 480(%rdi), %ymm6
1275 ; AVX2-NEXT: vmovaps 448(%rdi), %ymm14
1276 ; AVX2-NEXT: vmovaps 352(%rdi), %ymm5
1277 ; AVX2-NEXT: vmovaps 320(%rdi), %ymm15
1278 ; AVX2-NEXT: vmovaps 224(%rdi), %ymm2
1279 ; AVX2-NEXT: vmovaps 192(%rdi), %ymm13
1280 ; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm13[0,2],ymm2[0,2],ymm13[4,6],ymm2[4,6]
1281 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1282 ; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm13[1,3],ymm2[1,3],ymm13[5,7],ymm2[5,7]
1283 ; AVX2-NEXT: vshufps {{.*#+}} ymm13 = ymm15[0,2],ymm5[0,2],ymm15[4,6],ymm5[4,6]
1284 ; AVX2-NEXT: vshufps {{.*#+}} ymm5 = ymm15[1,3],ymm5[1,3],ymm15[5,7],ymm5[5,7]
1285 ; AVX2-NEXT: vshufps {{.*#+}} ymm15 = ymm14[0,2],ymm6[0,2],ymm14[4,6],ymm6[4,6]
1286 ; AVX2-NEXT: vshufps {{.*#+}} ymm6 = ymm14[1,3],ymm6[1,3],ymm14[5,7],ymm6[5,7]
1287 ; AVX2-NEXT: vshufps {{.*#+}} ymm14 = ymm12[0,2],ymm10[0,2],ymm12[4,6],ymm10[4,6]
1288 ; AVX2-NEXT: vshufps {{.*#+}} ymm10 = ymm12[1,3],ymm10[1,3],ymm12[5,7],ymm10[5,7]
1289 ; AVX2-NEXT: vshufps {{.*#+}} ymm12 = ymm11[0,2],ymm8[0,2],ymm11[4,6],ymm8[4,6]
1290 ; AVX2-NEXT: vshufps {{.*#+}} ymm8 = ymm11[1,3],ymm8[1,3],ymm11[5,7],ymm8[5,7]
1291 ; AVX2-NEXT: vshufps {{.*#+}} ymm11 = ymm9[0,2],ymm7[0,2],ymm9[4,6],ymm7[4,6]
1292 ; AVX2-NEXT: vshufps {{.*#+}} ymm7 = ymm9[1,3],ymm7[1,3],ymm9[5,7],ymm7[5,7]
1293 ; AVX2-NEXT: vshufps {{.*#+}} ymm9 = ymm3[0,2],ymm4[0,2],ymm3[4,6],ymm4[4,6]
1294 ; AVX2-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,3],ymm4[1,3],ymm3[5,7],ymm4[5,7]
1295 ; AVX2-NEXT: vmovaps 32(%rdi), %ymm4
1296 ; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,2],ymm4[0,2],ymm1[4,6],ymm4[4,6]
1297 ; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,3],ymm4[1,3],ymm1[5,7],ymm4[5,7]
1298 ; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm14[0,2,1,3]
1299 ; AVX2-NEXT: vmovaps %ymm4, 192(%rsi)
1300 ; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm12[0,2,1,3]
1301 ; AVX2-NEXT: vmovaps %ymm4, 128(%rsi)
1302 ; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm11[0,2,1,3]
1303 ; AVX2-NEXT: vmovaps %ymm4, 64(%rsi)
1304 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
1305 ; AVX2-NEXT: vmovaps %ymm0, (%rsi)
1306 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm15[0,2,1,3]
1307 ; AVX2-NEXT: vmovaps %ymm0, 224(%rsi)
1308 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm13[0,2,1,3]
1309 ; AVX2-NEXT: vmovaps %ymm0, 160(%rsi)
1310 ; AVX2-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
1311 ; AVX2-NEXT: # ymm0 = mem[0,2,1,3]
1312 ; AVX2-NEXT: vmovaps %ymm0, 96(%rsi)
1313 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm9[0,2,1,3]
1314 ; AVX2-NEXT: vmovaps %ymm0, 32(%rsi)
1315 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm1[0,2,1,3]
1316 ; AVX2-NEXT: vmovaps %ymm0, (%rdx)
1317 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm7[0,2,1,3]
1318 ; AVX2-NEXT: vmovaps %ymm0, 64(%rdx)
1319 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm8[0,2,1,3]
1320 ; AVX2-NEXT: vmovaps %ymm0, 128(%rdx)
1321 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm10[0,2,1,3]
1322 ; AVX2-NEXT: vmovaps %ymm0, 192(%rdx)
1323 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm6[0,2,1,3]
1324 ; AVX2-NEXT: vmovaps %ymm0, 224(%rdx)
1325 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm5[0,2,1,3]
1326 ; AVX2-NEXT: vmovaps %ymm0, 160(%rdx)
1327 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm2[0,2,1,3]
1328 ; AVX2-NEXT: vmovaps %ymm0, 96(%rdx)
1329 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm3[0,2,1,3]
1330 ; AVX2-NEXT: vmovaps %ymm0, 32(%rdx)
1331 ; AVX2-NEXT: vzeroupper
1332 ; AVX2-NEXT: retq
1333 ;
1334 ; AVX2-FP-LABEL: load_i32_stride2_vf64:
1335 ; AVX2-FP: # %bb.0:
1336 ; AVX2-FP-NEXT: vmovaps (%rdi), %ymm1
1337 ; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm3
1338 ; AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm4
1339 ; AVX2-FP-NEXT: vmovaps 416(%rdi), %ymm10
1340 ; AVX2-FP-NEXT: vmovaps 384(%rdi), %ymm12
1341 ; AVX2-FP-NEXT: vmovaps 288(%rdi), %ymm8
1342 ; AVX2-FP-NEXT: vmovaps 256(%rdi), %ymm11
1343 ; AVX2-FP-NEXT: vmovaps 160(%rdi), %ymm7
1344 ; AVX2-FP-NEXT: vmovaps 128(%rdi), %ymm9
1345 ; AVX2-FP-NEXT: vmovaps 480(%rdi), %ymm6
1346 ; AVX2-FP-NEXT: vmovaps 448(%rdi), %ymm14
1347 ; AVX2-FP-NEXT: vmovaps 352(%rdi), %ymm5
1348 ; AVX2-FP-NEXT: vmovaps 320(%rdi), %ymm15
1349 ; AVX2-FP-NEXT: vmovaps 224(%rdi), %ymm2
1350 ; AVX2-FP-NEXT: vmovaps 192(%rdi), %ymm13
1351 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm13[0,2],ymm2[0,2],ymm13[4,6],ymm2[4,6]
1352 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1353 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm13[1,3],ymm2[1,3],ymm13[5,7],ymm2[5,7]
1354 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm13 = ymm15[0,2],ymm5[0,2],ymm15[4,6],ymm5[4,6]
1355 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm5 = ymm15[1,3],ymm5[1,3],ymm15[5,7],ymm5[5,7]
1356 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm15 = ymm14[0,2],ymm6[0,2],ymm14[4,6],ymm6[4,6]
1357 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm6 = ymm14[1,3],ymm6[1,3],ymm14[5,7],ymm6[5,7]
1358 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm14 = ymm12[0,2],ymm10[0,2],ymm12[4,6],ymm10[4,6]
1359 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm10 = ymm12[1,3],ymm10[1,3],ymm12[5,7],ymm10[5,7]
1360 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm12 = ymm11[0,2],ymm8[0,2],ymm11[4,6],ymm8[4,6]
1361 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm8 = ymm11[1,3],ymm8[1,3],ymm11[5,7],ymm8[5,7]
1362 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm11 = ymm9[0,2],ymm7[0,2],ymm9[4,6],ymm7[4,6]
1363 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm7 = ymm9[1,3],ymm7[1,3],ymm9[5,7],ymm7[5,7]
1364 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm9 = ymm3[0,2],ymm4[0,2],ymm3[4,6],ymm4[4,6]
1365 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,3],ymm4[1,3],ymm3[5,7],ymm4[5,7]
1366 ; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm4
1367 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,2],ymm4[0,2],ymm1[4,6],ymm4[4,6]
1368 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,3],ymm4[1,3],ymm1[5,7],ymm4[5,7]
1369 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm4 = ymm14[0,2,1,3]
1370 ; AVX2-FP-NEXT: vmovaps %ymm4, 192(%rsi)
1371 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm4 = ymm12[0,2,1,3]
1372 ; AVX2-FP-NEXT: vmovaps %ymm4, 128(%rsi)
1373 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm4 = ymm11[0,2,1,3]
1374 ; AVX2-FP-NEXT: vmovaps %ymm4, 64(%rsi)
1375 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
1376 ; AVX2-FP-NEXT: vmovaps %ymm0, (%rsi)
1377 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm15[0,2,1,3]
1378 ; AVX2-FP-NEXT: vmovaps %ymm0, 224(%rsi)
1379 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm13[0,2,1,3]
1380 ; AVX2-FP-NEXT: vmovaps %ymm0, 160(%rsi)
1381 ; AVX2-FP-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
1382 ; AVX2-FP-NEXT: # ymm0 = mem[0,2,1,3]
1383 ; AVX2-FP-NEXT: vmovaps %ymm0, 96(%rsi)
1384 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm9[0,2,1,3]
1385 ; AVX2-FP-NEXT: vmovaps %ymm0, 32(%rsi)
1386 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm1[0,2,1,3]
1387 ; AVX2-FP-NEXT: vmovaps %ymm0, (%rdx)
1388 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm7[0,2,1,3]
1389 ; AVX2-FP-NEXT: vmovaps %ymm0, 64(%rdx)
1390 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm8[0,2,1,3]
1391 ; AVX2-FP-NEXT: vmovaps %ymm0, 128(%rdx)
1392 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm10[0,2,1,3]
1393 ; AVX2-FP-NEXT: vmovaps %ymm0, 192(%rdx)
1394 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm6[0,2,1,3]
1395 ; AVX2-FP-NEXT: vmovaps %ymm0, 224(%rdx)
1396 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm5[0,2,1,3]
1397 ; AVX2-FP-NEXT: vmovaps %ymm0, 160(%rdx)
1398 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm2[0,2,1,3]
1399 ; AVX2-FP-NEXT: vmovaps %ymm0, 96(%rdx)
1400 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm3[0,2,1,3]
1401 ; AVX2-FP-NEXT: vmovaps %ymm0, 32(%rdx)
1402 ; AVX2-FP-NEXT: vzeroupper
1403 ; AVX2-FP-NEXT: retq
1404 ;
1405 ; AVX2-FCP-LABEL: load_i32_stride2_vf64:
1406 ; AVX2-FCP: # %bb.0:
1407 ; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm1
1408 ; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm3
1409 ; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm4
1410 ; AVX2-FCP-NEXT: vmovaps 416(%rdi), %ymm10
1411 ; AVX2-FCP-NEXT: vmovaps 384(%rdi), %ymm12
1412 ; AVX2-FCP-NEXT: vmovaps 288(%rdi), %ymm8
1413 ; AVX2-FCP-NEXT: vmovaps 256(%rdi), %ymm11
1414 ; AVX2-FCP-NEXT: vmovaps 160(%rdi), %ymm7
1415 ; AVX2-FCP-NEXT: vmovaps 128(%rdi), %ymm9
1416 ; AVX2-FCP-NEXT: vmovaps 480(%rdi), %ymm6
1417 ; AVX2-FCP-NEXT: vmovaps 448(%rdi), %ymm14
1418 ; AVX2-FCP-NEXT: vmovaps 352(%rdi), %ymm5
1419 ; AVX2-FCP-NEXT: vmovaps 320(%rdi), %ymm15
1420 ; AVX2-FCP-NEXT: vmovaps 224(%rdi), %ymm2
1421 ; AVX2-FCP-NEXT: vmovaps 192(%rdi), %ymm13
1422 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm13[0,2],ymm2[0,2],ymm13[4,6],ymm2[4,6]
1423 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1424 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm13[1,3],ymm2[1,3],ymm13[5,7],ymm2[5,7]
1425 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm13 = ymm15[0,2],ymm5[0,2],ymm15[4,6],ymm5[4,6]
1426 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm5 = ymm15[1,3],ymm5[1,3],ymm15[5,7],ymm5[5,7]
1427 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm15 = ymm14[0,2],ymm6[0,2],ymm14[4,6],ymm6[4,6]
1428 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm6 = ymm14[1,3],ymm6[1,3],ymm14[5,7],ymm6[5,7]
1429 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm14 = ymm12[0,2],ymm10[0,2],ymm12[4,6],ymm10[4,6]
1430 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm10 = ymm12[1,3],ymm10[1,3],ymm12[5,7],ymm10[5,7]
1431 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm12 = ymm11[0,2],ymm8[0,2],ymm11[4,6],ymm8[4,6]
1432 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm8 = ymm11[1,3],ymm8[1,3],ymm11[5,7],ymm8[5,7]
1433 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm11 = ymm9[0,2],ymm7[0,2],ymm9[4,6],ymm7[4,6]
1434 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm7 = ymm9[1,3],ymm7[1,3],ymm9[5,7],ymm7[5,7]
1435 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm9 = ymm3[0,2],ymm4[0,2],ymm3[4,6],ymm4[4,6]
1436 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,3],ymm4[1,3],ymm3[5,7],ymm4[5,7]
1437 ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm4
1438 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,2],ymm4[0,2],ymm1[4,6],ymm4[4,6]
1439 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,3],ymm4[1,3],ymm1[5,7],ymm4[5,7]
1440 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm4 = ymm14[0,2,1,3]
1441 ; AVX2-FCP-NEXT: vmovaps %ymm4, 192(%rsi)
1442 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm4 = ymm12[0,2,1,3]
1443 ; AVX2-FCP-NEXT: vmovaps %ymm4, 128(%rsi)
1444 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm4 = ymm11[0,2,1,3]
1445 ; AVX2-FCP-NEXT: vmovaps %ymm4, 64(%rsi)
1446 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
1447 ; AVX2-FCP-NEXT: vmovaps %ymm0, (%rsi)
1448 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm15[0,2,1,3]
1449 ; AVX2-FCP-NEXT: vmovaps %ymm0, 224(%rsi)
1450 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm13[0,2,1,3]
1451 ; AVX2-FCP-NEXT: vmovaps %ymm0, 160(%rsi)
1452 ; AVX2-FCP-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
1453 ; AVX2-FCP-NEXT: # ymm0 = mem[0,2,1,3]
1454 ; AVX2-FCP-NEXT: vmovaps %ymm0, 96(%rsi)
1455 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm9[0,2,1,3]
1456 ; AVX2-FCP-NEXT: vmovaps %ymm0, 32(%rsi)
1457 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm1[0,2,1,3]
1458 ; AVX2-FCP-NEXT: vmovaps %ymm0, (%rdx)
1459 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm7[0,2,1,3]
1460 ; AVX2-FCP-NEXT: vmovaps %ymm0, 64(%rdx)
1461 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm8[0,2,1,3]
1462 ; AVX2-FCP-NEXT: vmovaps %ymm0, 128(%rdx)
1463 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm10[0,2,1,3]
1464 ; AVX2-FCP-NEXT: vmovaps %ymm0, 192(%rdx)
1465 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm6[0,2,1,3]
1466 ; AVX2-FCP-NEXT: vmovaps %ymm0, 224(%rdx)
1467 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm5[0,2,1,3]
1468 ; AVX2-FCP-NEXT: vmovaps %ymm0, 160(%rdx)
1469 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm2[0,2,1,3]
1470 ; AVX2-FCP-NEXT: vmovaps %ymm0, 96(%rdx)
1471 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm3[0,2,1,3]
1472 ; AVX2-FCP-NEXT: vmovaps %ymm0, 32(%rdx)
1473 ; AVX2-FCP-NEXT: vzeroupper
1474 ; AVX2-FCP-NEXT: retq
1475 ;
1476 ; AVX512-LABEL: load_i32_stride2_vf64:
1477 ; AVX512: # %bb.0:
1478 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
1479 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1
1480 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm2
1481 ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm3
1482 ; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm4
1483 ; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm5
1484 ; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm6
1485 ; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm7
1486 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
1487 ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm9
1488 ; AVX512-NEXT: vpermt2d %zmm6, %zmm8, %zmm9
1489 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm10
1490 ; AVX512-NEXT: vpermt2d %zmm4, %zmm8, %zmm10
1491 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm11
1492 ; AVX512-NEXT: vpermt2d %zmm3, %zmm8, %zmm11
1493 ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm8
1494 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm12 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
1495 ; AVX512-NEXT: vpermt2d %zmm4, %zmm12, %zmm5
1496 ; AVX512-NEXT: vpermt2d %zmm6, %zmm12, %zmm7
1497 ; AVX512-NEXT: vpermt2d %zmm3, %zmm12, %zmm2
1498 ; AVX512-NEXT: vpermt2d %zmm1, %zmm12, %zmm0
1499 ; AVX512-NEXT: vmovdqa64 %zmm10, 192(%rsi)
1500 ; AVX512-NEXT: vmovdqa64 %zmm8, (%rsi)
1501 ; AVX512-NEXT: vmovdqa64 %zmm11, 64(%rsi)
1502 ; AVX512-NEXT: vmovdqa64 %zmm9, 128(%rsi)
1503 ; AVX512-NEXT: vmovdqa64 %zmm7, 128(%rdx)
1504 ; AVX512-NEXT: vmovdqa64 %zmm5, 192(%rdx)
1505 ; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx)
1506 ; AVX512-NEXT: vmovdqa64 %zmm2, 64(%rdx)
1507 ; AVX512-NEXT: vzeroupper
1508 ; AVX512-NEXT: retq
1509 ;
1510 ; AVX512-FCP-LABEL: load_i32_stride2_vf64:
1511 ; AVX512-FCP: # %bb.0:
1512 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
1513 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
1514 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2
1515 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3
1516 ; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm4
1517 ; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm5
1518 ; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm6
1519 ; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm7
1520 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
1521 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm9
1522 ; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm8, %zmm9
1523 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm10
1524 ; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm8, %zmm10
1525 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm11
1526 ; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm8, %zmm11
1527 ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8
1528 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
1529 ; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm12, %zmm5
1530 ; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm12, %zmm7
1531 ; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm12, %zmm2
1532 ; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm12, %zmm0
1533 ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, 192(%rsi)
1534 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, (%rsi)
1535 ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, 64(%rsi)
1536 ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, 128(%rsi)
1537 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 128(%rdx)
1538 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 192(%rdx)
1539 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%rdx)
1540 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 64(%rdx)
1541 ; AVX512-FCP-NEXT: vzeroupper
1542 ; AVX512-FCP-NEXT: retq
1543 ;
1544 ; AVX512DQ-LABEL: load_i32_stride2_vf64:
1545 ; AVX512DQ: # %bb.0:
1546 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0
1547 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm1
1548 ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm2
1549 ; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm3
1550 ; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm4
1551 ; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm5
1552 ; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm6
1553 ; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm7
1554 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
1555 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm9
1556 ; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm8, %zmm9
1557 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm10
1558 ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm8, %zmm10
1559 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm11
1560 ; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm8, %zmm11
1561 ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm8
1562 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm12 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
1563 ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm12, %zmm5
1564 ; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm12, %zmm7
1565 ; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm12, %zmm2
1566 ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm12, %zmm0
1567 ; AVX512DQ-NEXT: vmovdqa64 %zmm10, 192(%rsi)
1568 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, (%rsi)
1569 ; AVX512DQ-NEXT: vmovdqa64 %zmm11, 64(%rsi)
1570 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, 128(%rsi)
1571 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, 128(%rdx)
1572 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, 192(%rdx)
1573 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx)
1574 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, 64(%rdx)
1575 ; AVX512DQ-NEXT: vzeroupper
1576 ; AVX512DQ-NEXT: retq
1577 ;
1578 ; AVX512DQ-FCP-LABEL: load_i32_stride2_vf64:
1579 ; AVX512DQ-FCP: # %bb.0:
1580 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
1581 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
1582 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2
1583 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3
1584 ; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm4
1585 ; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm5
1586 ; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm6
1587 ; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm7
1588 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
1589 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm9
1590 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm8, %zmm9
1591 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm10
1592 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm8, %zmm10
1593 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm11
1594 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm8, %zmm11
1595 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8
1596 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
1597 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm12, %zmm5
1598 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm12, %zmm7
1599 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm12, %zmm2
1600 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm12, %zmm0
1601 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, 192(%rsi)
1602 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, (%rsi)
1603 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, 64(%rsi)
1604 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, 128(%rsi)
1605 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 128(%rdx)
1606 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 192(%rdx)
1607 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%rdx)
1608 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 64(%rdx)
1609 ; AVX512DQ-FCP-NEXT: vzeroupper
1610 ; AVX512DQ-FCP-NEXT: retq
1611 ;
1612 ; AVX512BW-LABEL: load_i32_stride2_vf64:
1613 ; AVX512BW: # %bb.0:
1614 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
1615 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1
1616 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm2
1617 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm3
1618 ; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm4
1619 ; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm5
1620 ; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm6
1621 ; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm7
1622 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
1623 ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9
1624 ; AVX512BW-NEXT: vpermt2d %zmm6, %zmm8, %zmm9
1625 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm10
1626 ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm8, %zmm10
1627 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm11
1628 ; AVX512BW-NEXT: vpermt2d %zmm3, %zmm8, %zmm11
1629 ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm8
1630 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
1631 ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm12, %zmm5
1632 ; AVX512BW-NEXT: vpermt2d %zmm6, %zmm12, %zmm7
1633 ; AVX512BW-NEXT: vpermt2d %zmm3, %zmm12, %zmm2
1634 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm12, %zmm0
1635 ; AVX512BW-NEXT: vmovdqa64 %zmm10, 192(%rsi)
1636 ; AVX512BW-NEXT: vmovdqa64 %zmm8, (%rsi)
1637 ; AVX512BW-NEXT: vmovdqa64 %zmm11, 64(%rsi)
1638 ; AVX512BW-NEXT: vmovdqa64 %zmm9, 128(%rsi)
1639 ; AVX512BW-NEXT: vmovdqa64 %zmm7, 128(%rdx)
1640 ; AVX512BW-NEXT: vmovdqa64 %zmm5, 192(%rdx)
1641 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
1642 ; AVX512BW-NEXT: vmovdqa64 %zmm2, 64(%rdx)
1643 ; AVX512BW-NEXT: vzeroupper
1644 ; AVX512BW-NEXT: retq
1645 ;
1646 ; AVX512BW-FCP-LABEL: load_i32_stride2_vf64:
1647 ; AVX512BW-FCP: # %bb.0:
1648 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
1649 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
1650 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2
1651 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3
1652 ; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm4
1653 ; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm5
1654 ; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm6
1655 ; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm7
1656 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
1657 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm9
1658 ; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm8, %zmm9
1659 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm10
1660 ; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm8, %zmm10
1661 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm11
1662 ; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm8, %zmm11
1663 ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8
1664 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
1665 ; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm12, %zmm5
1666 ; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm12, %zmm7
1667 ; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm12, %zmm2
1668 ; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm12, %zmm0
1669 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, 192(%rsi)
1670 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, (%rsi)
1671 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, 64(%rsi)
1672 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, 128(%rsi)
1673 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, 128(%rdx)
1674 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 192(%rdx)
1675 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%rdx)
1676 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 64(%rdx)
1677 ; AVX512BW-FCP-NEXT: vzeroupper
1678 ; AVX512BW-FCP-NEXT: retq
1679 ;
1680 ; AVX512DQ-BW-LABEL: load_i32_stride2_vf64:
1681 ; AVX512DQ-BW: # %bb.0:
1682 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0
1683 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1
1684 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm2
1685 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm3
1686 ; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm4
1687 ; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm5
1688 ; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm6
1689 ; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm7
1690 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
1691 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm9
1692 ; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm8, %zmm9
1693 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm10
1694 ; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm8, %zmm10
1695 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm11
1696 ; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm8, %zmm11
1697 ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm8
1698 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
1699 ; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm12, %zmm5
1700 ; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm12, %zmm7
1701 ; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm12, %zmm2
1702 ; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm12, %zmm0
1703 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, 192(%rsi)
1704 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, (%rsi)
1705 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, 64(%rsi)
1706 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, 128(%rsi)
1707 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, 128(%rdx)
1708 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 192(%rdx)
1709 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%rdx)
1710 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 64(%rdx)
1711 ; AVX512DQ-BW-NEXT: vzeroupper
1712 ; AVX512DQ-BW-NEXT: retq
1713 ;
1714 ; AVX512DQ-BW-FCP-LABEL: load_i32_stride2_vf64:
1715 ; AVX512DQ-BW-FCP: # %bb.0:
1716 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
1717 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
1718 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2
1719 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3
1720 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm4
1721 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm5
1722 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm6
1723 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm7
1724 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
1725 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm9
1726 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm8, %zmm9
1727 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm10
1728 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm8, %zmm10
1729 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm11
1730 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm8, %zmm11
1731 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8
1732 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
1733 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm12, %zmm5
1734 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm12, %zmm7
1735 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm12, %zmm2
1736 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm12, %zmm0
1737 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, 192(%rsi)
1738 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, (%rsi)
1739 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, 64(%rsi)
1740 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, 128(%rsi)
1741 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, 128(%rdx)
1742 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 192(%rdx)
1743 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%rdx)
1744 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 64(%rdx)
1745 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
1746 ; AVX512DQ-BW-FCP-NEXT: retq
1747 %wide.vec = load <128 x i32>, ptr %in.vec, align 64
1748 %strided.vec0 = shufflevector <128 x i32> %wide.vec, <128 x i32> poison, <64 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62, i32 64, i32 66, i32 68, i32 70, i32 72, i32 74, i32 76, i32 78, i32 80, i32 82, i32 84, i32 86, i32 88, i32 90, i32 92, i32 94, i32 96, i32 98, i32 100, i32 102, i32 104, i32 106, i32 108, i32 110, i32 112, i32 114, i32 116, i32 118, i32 120, i32 122, i32 124, i32 126>
1749 %strided.vec1 = shufflevector <128 x i32> %wide.vec, <128 x i32> poison, <64 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 33, i32 35, i32 37, i32 39, i32 41, i32 43, i32 45, i32 47, i32 49, i32 51, i32 53, i32 55, i32 57, i32 59, i32 61, i32 63, i32 65, i32 67, i32 69, i32 71, i32 73, i32 75, i32 77, i32 79, i32 81, i32 83, i32 85, i32 87, i32 89, i32 91, i32 93, i32 95, i32 97, i32 99, i32 101, i32 103, i32 105, i32 107, i32 109, i32 111, i32 113, i32 115, i32 117, i32 119, i32 121, i32 123, i32 125, i32 127>
1750 store <64 x i32> %strided.vec0, ptr %out.vec0, align 64
1751 store <64 x i32> %strided.vec1, ptr %out.vec1, align 64