1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
3 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX
4 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
5 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FP
6 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FCP
7 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512
8 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512-FCP
9 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512DQ
10 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-FCP
11 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW
12 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW-FCP
13 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512DQ-BW
14 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-BW-FCP
16 ; These patterns are produced by LoopVectorizer for interleaved loads.
18 define void @load_i32_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) nounwind {
19 ; SSE-LABEL: load_i32_stride3_vf2:
21 ; SSE-NEXT: movdqa (%rdi), %xmm0
22 ; SSE-NEXT: movdqa 16(%rdi), %xmm1
23 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,3,2,3]
24 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
25 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
26 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
27 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
28 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
29 ; SSE-NEXT: movq %xmm2, (%rsi)
30 ; SSE-NEXT: movq %xmm3, (%rdx)
31 ; SSE-NEXT: movq %xmm0, (%rcx)
34 ; AVX-LABEL: load_i32_stride3_vf2:
36 ; AVX-NEXT: vmovaps (%rdi), %xmm0
37 ; AVX-NEXT: vmovaps 16(%rdi), %xmm1
38 ; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,3,2,3]
39 ; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0],xmm0[1],xmm1[2,3]
40 ; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,0,2,3]
41 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
42 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
43 ; AVX-NEXT: vmovlps %xmm2, (%rsi)
44 ; AVX-NEXT: vmovlps %xmm3, (%rdx)
45 ; AVX-NEXT: vmovlps %xmm0, (%rcx)
48 ; AVX2-LABEL: load_i32_stride3_vf2:
50 ; AVX2-NEXT: vmovaps (%rdi), %xmm0
51 ; AVX2-NEXT: vmovaps 16(%rdi), %xmm1
52 ; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,3,2,3]
53 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
54 ; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
55 ; AVX2-NEXT: vbroadcastss 8(%rdi), %xmm3
56 ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3]
57 ; AVX2-NEXT: vmovlps %xmm2, (%rsi)
58 ; AVX2-NEXT: vmovlps %xmm0, (%rdx)
59 ; AVX2-NEXT: vmovlps %xmm1, (%rcx)
62 ; AVX2-FP-LABEL: load_i32_stride3_vf2:
64 ; AVX2-FP-NEXT: vmovaps (%rdi), %xmm0
65 ; AVX2-FP-NEXT: vmovaps 16(%rdi), %xmm1
66 ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,3,2,3]
67 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
68 ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
69 ; AVX2-FP-NEXT: vbroadcastss 8(%rdi), %xmm3
70 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3]
71 ; AVX2-FP-NEXT: vmovlps %xmm2, (%rsi)
72 ; AVX2-FP-NEXT: vmovlps %xmm0, (%rdx)
73 ; AVX2-FP-NEXT: vmovlps %xmm1, (%rcx)
76 ; AVX2-FCP-LABEL: load_i32_stride3_vf2:
78 ; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm0
79 ; AVX2-FCP-NEXT: vmovaps 16(%rdi), %xmm1
80 ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,3,2,3]
81 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
82 ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
83 ; AVX2-FCP-NEXT: vbroadcastss 8(%rdi), %xmm3
84 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3]
85 ; AVX2-FCP-NEXT: vmovlps %xmm2, (%rsi)
86 ; AVX2-FCP-NEXT: vmovlps %xmm0, (%rdx)
87 ; AVX2-FCP-NEXT: vmovlps %xmm1, (%rcx)
90 ; AVX512-LABEL: load_i32_stride3_vf2:
92 ; AVX512-NEXT: vmovaps (%rdi), %xmm0
93 ; AVX512-NEXT: vmovaps 16(%rdi), %xmm1
94 ; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,3,2,3]
95 ; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
96 ; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
97 ; AVX512-NEXT: vbroadcastss 8(%rdi), %xmm3
98 ; AVX512-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3]
99 ; AVX512-NEXT: vmovlps %xmm2, (%rsi)
100 ; AVX512-NEXT: vmovlps %xmm0, (%rdx)
101 ; AVX512-NEXT: vmovlps %xmm1, (%rcx)
104 ; AVX512-FCP-LABEL: load_i32_stride3_vf2:
105 ; AVX512-FCP: # %bb.0:
106 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
107 ; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
108 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3]
109 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [5,0]
110 ; AVX512-FCP-NEXT: vpermi2d %xmm0, %xmm1, %xmm3
111 ; AVX512-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm0
112 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
113 ; AVX512-FCP-NEXT: vmovq %xmm2, (%rsi)
114 ; AVX512-FCP-NEXT: vmovq %xmm3, (%rdx)
115 ; AVX512-FCP-NEXT: vmovq %xmm0, (%rcx)
116 ; AVX512-FCP-NEXT: retq
118 ; AVX512DQ-LABEL: load_i32_stride3_vf2:
120 ; AVX512DQ-NEXT: vmovaps (%rdi), %xmm0
121 ; AVX512DQ-NEXT: vmovaps 16(%rdi), %xmm1
122 ; AVX512DQ-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,3,2,3]
123 ; AVX512DQ-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
124 ; AVX512DQ-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
125 ; AVX512DQ-NEXT: vbroadcastss 8(%rdi), %xmm3
126 ; AVX512DQ-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3]
127 ; AVX512DQ-NEXT: vmovlps %xmm2, (%rsi)
128 ; AVX512DQ-NEXT: vmovlps %xmm0, (%rdx)
129 ; AVX512DQ-NEXT: vmovlps %xmm1, (%rcx)
130 ; AVX512DQ-NEXT: retq
132 ; AVX512DQ-FCP-LABEL: load_i32_stride3_vf2:
133 ; AVX512DQ-FCP: # %bb.0:
134 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
135 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
136 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3]
137 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [5,0]
138 ; AVX512DQ-FCP-NEXT: vpermi2d %xmm0, %xmm1, %xmm3
139 ; AVX512DQ-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm0
140 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
141 ; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rsi)
142 ; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rdx)
143 ; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rcx)
144 ; AVX512DQ-FCP-NEXT: retq
146 ; AVX512BW-LABEL: load_i32_stride3_vf2:
148 ; AVX512BW-NEXT: vmovaps (%rdi), %xmm0
149 ; AVX512BW-NEXT: vmovaps 16(%rdi), %xmm1
150 ; AVX512BW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,3,2,3]
151 ; AVX512BW-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
152 ; AVX512BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
153 ; AVX512BW-NEXT: vbroadcastss 8(%rdi), %xmm3
154 ; AVX512BW-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3]
155 ; AVX512BW-NEXT: vmovlps %xmm2, (%rsi)
156 ; AVX512BW-NEXT: vmovlps %xmm0, (%rdx)
157 ; AVX512BW-NEXT: vmovlps %xmm1, (%rcx)
158 ; AVX512BW-NEXT: retq
160 ; AVX512BW-FCP-LABEL: load_i32_stride3_vf2:
161 ; AVX512BW-FCP: # %bb.0:
162 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
163 ; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
164 ; AVX512BW-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3]
165 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [5,0]
166 ; AVX512BW-FCP-NEXT: vpermi2d %xmm0, %xmm1, %xmm3
167 ; AVX512BW-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm0
168 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
169 ; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rsi)
170 ; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rdx)
171 ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rcx)
172 ; AVX512BW-FCP-NEXT: retq
174 ; AVX512DQ-BW-LABEL: load_i32_stride3_vf2:
175 ; AVX512DQ-BW: # %bb.0:
176 ; AVX512DQ-BW-NEXT: vmovaps (%rdi), %xmm0
177 ; AVX512DQ-BW-NEXT: vmovaps 16(%rdi), %xmm1
178 ; AVX512DQ-BW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,3,2,3]
179 ; AVX512DQ-BW-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
180 ; AVX512DQ-BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
181 ; AVX512DQ-BW-NEXT: vbroadcastss 8(%rdi), %xmm3
182 ; AVX512DQ-BW-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3]
183 ; AVX512DQ-BW-NEXT: vmovlps %xmm2, (%rsi)
184 ; AVX512DQ-BW-NEXT: vmovlps %xmm0, (%rdx)
185 ; AVX512DQ-BW-NEXT: vmovlps %xmm1, (%rcx)
186 ; AVX512DQ-BW-NEXT: retq
188 ; AVX512DQ-BW-FCP-LABEL: load_i32_stride3_vf2:
189 ; AVX512DQ-BW-FCP: # %bb.0:
190 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
191 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
192 ; AVX512DQ-BW-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3]
193 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [5,0]
194 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm0, %xmm1, %xmm3
195 ; AVX512DQ-BW-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm0
196 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
197 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rsi)
198 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rdx)
199 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rcx)
200 ; AVX512DQ-BW-FCP-NEXT: retq
201 %wide.vec = load <6 x i32>, ptr %in.vec, align 64
202 %strided.vec0 = shufflevector <6 x i32> %wide.vec, <6 x i32> poison, <2 x i32> <i32 0, i32 3>
203 %strided.vec1 = shufflevector <6 x i32> %wide.vec, <6 x i32> poison, <2 x i32> <i32 1, i32 4>
204 %strided.vec2 = shufflevector <6 x i32> %wide.vec, <6 x i32> poison, <2 x i32> <i32 2, i32 5>
205 store <2 x i32> %strided.vec0, ptr %out.vec0, align 64
206 store <2 x i32> %strided.vec1, ptr %out.vec1, align 64
207 store <2 x i32> %strided.vec2, ptr %out.vec2, align 64
211 define void @load_i32_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) nounwind {
212 ; SSE-LABEL: load_i32_stride3_vf4:
214 ; SSE-NEXT: movdqa (%rdi), %xmm0
215 ; SSE-NEXT: movaps 16(%rdi), %xmm1
216 ; SSE-NEXT: movaps 32(%rdi), %xmm2
217 ; SSE-NEXT: movdqa %xmm0, %xmm3
218 ; SSE-NEXT: movaps %xmm1, %xmm4
219 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,3,2,3]
220 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
221 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,1,1]
222 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[1,0]
223 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[0,2]
224 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm2[2,3]
225 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2]
226 ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
227 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm2[0,3]
228 ; SSE-NEXT: movaps %xmm3, (%rsi)
229 ; SSE-NEXT: movaps %xmm0, (%rdx)
230 ; SSE-NEXT: movaps %xmm5, (%rcx)
233 ; AVX-LABEL: load_i32_stride3_vf4:
235 ; AVX-NEXT: vmovaps (%rdi), %xmm0
236 ; AVX-NEXT: vmovaps 16(%rdi), %xmm1
237 ; AVX-NEXT: vmovaps 32(%rdi), %xmm2
238 ; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3]
239 ; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3]
240 ; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,3,2,1]
241 ; AVX-NEXT: vmovaps 32(%rdi), %xmm4
242 ; AVX-NEXT: vblendps {{.*#+}} xmm4 = mem[0,1],xmm4[2,3]
243 ; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm1[0],xmm4[1,2],xmm1[3]
244 ; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,0,3,2]
245 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3]
246 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
247 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,1,0,3]
248 ; AVX-NEXT: vmovaps %xmm3, (%rsi)
249 ; AVX-NEXT: vmovaps %xmm4, (%rdx)
250 ; AVX-NEXT: vmovaps %xmm0, (%rcx)
253 ; AVX2-LABEL: load_i32_stride3_vf4:
255 ; AVX2-NEXT: vmovaps (%rdi), %ymm0
256 ; AVX2-NEXT: vmovaps 32(%rdi), %ymm1
257 ; AVX2-NEXT: vmovaps {{.*#+}} xmm2 = [0,3,6,1]
258 ; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7]
259 ; AVX2-NEXT: vpermps %ymm3, %ymm2, %ymm2
260 ; AVX2-NEXT: vmovaps {{.*#+}} xmm3 = [1,4,7,2]
261 ; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
262 ; AVX2-NEXT: vpermps %ymm4, %ymm3, %ymm3
263 ; AVX2-NEXT: vmovaps {{.*#+}} xmm4 = [2,5,0,3]
264 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4,5,6,7]
265 ; AVX2-NEXT: vpermps %ymm0, %ymm4, %ymm0
266 ; AVX2-NEXT: vmovaps %xmm2, (%rsi)
267 ; AVX2-NEXT: vmovaps %xmm3, (%rdx)
268 ; AVX2-NEXT: vmovaps %xmm0, (%rcx)
269 ; AVX2-NEXT: vzeroupper
272 ; AVX2-FP-LABEL: load_i32_stride3_vf4:
274 ; AVX2-FP-NEXT: vmovaps (%rdi), %ymm0
275 ; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm1
276 ; AVX2-FP-NEXT: vmovaps {{.*#+}} xmm2 = [0,3,6,1]
277 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7]
278 ; AVX2-FP-NEXT: vpermps %ymm3, %ymm2, %ymm2
279 ; AVX2-FP-NEXT: vmovaps {{.*#+}} xmm3 = [1,4,7,2]
280 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
281 ; AVX2-FP-NEXT: vpermps %ymm4, %ymm3, %ymm3
282 ; AVX2-FP-NEXT: vmovaps {{.*#+}} xmm4 = [2,5,0,3]
283 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4,5,6,7]
284 ; AVX2-FP-NEXT: vpermps %ymm0, %ymm4, %ymm0
285 ; AVX2-FP-NEXT: vmovaps %xmm2, (%rsi)
286 ; AVX2-FP-NEXT: vmovaps %xmm3, (%rdx)
287 ; AVX2-FP-NEXT: vmovaps %xmm0, (%rcx)
288 ; AVX2-FP-NEXT: vzeroupper
291 ; AVX2-FCP-LABEL: load_i32_stride3_vf4:
293 ; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm0
294 ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm1
295 ; AVX2-FCP-NEXT: vmovaps {{.*#+}} xmm2 = [0,3,6,1]
296 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7]
297 ; AVX2-FCP-NEXT: vpermps %ymm3, %ymm2, %ymm2
298 ; AVX2-FCP-NEXT: vmovaps {{.*#+}} xmm3 = [1,4,7,2]
299 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
300 ; AVX2-FCP-NEXT: vpermps %ymm4, %ymm3, %ymm3
301 ; AVX2-FCP-NEXT: vmovaps {{.*#+}} xmm4 = [2,5,0,3]
302 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4,5,6,7]
303 ; AVX2-FCP-NEXT: vpermps %ymm0, %ymm4, %ymm0
304 ; AVX2-FCP-NEXT: vmovaps %xmm2, (%rsi)
305 ; AVX2-FCP-NEXT: vmovaps %xmm3, (%rdx)
306 ; AVX2-FCP-NEXT: vmovaps %xmm0, (%rcx)
307 ; AVX2-FCP-NEXT: vzeroupper
308 ; AVX2-FCP-NEXT: retq
310 ; AVX512-LABEL: load_i32_stride3_vf4:
312 ; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,3,6,9]
313 ; AVX512-NEXT: vmovdqa (%rdi), %ymm1
314 ; AVX512-NEXT: vmovdqa 32(%rdi), %ymm2
315 ; AVX512-NEXT: vpermi2d %ymm2, %ymm1, %ymm0
316 ; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,4,7,10]
317 ; AVX512-NEXT: vpermi2d %ymm2, %ymm1, %ymm3
318 ; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,5,8,11]
319 ; AVX512-NEXT: vpermi2d %ymm2, %ymm1, %ymm4
320 ; AVX512-NEXT: vmovdqa %xmm0, (%rsi)
321 ; AVX512-NEXT: vmovdqa %xmm3, (%rdx)
322 ; AVX512-NEXT: vmovdqa %xmm4, (%rcx)
323 ; AVX512-NEXT: vzeroupper
326 ; AVX512-FCP-LABEL: load_i32_stride3_vf4:
327 ; AVX512-FCP: # %bb.0:
328 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,3,6,9]
329 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm1
330 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm2
331 ; AVX512-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm0
332 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,4,7,10]
333 ; AVX512-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm3
334 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,5,8,11]
335 ; AVX512-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm4
336 ; AVX512-FCP-NEXT: vmovdqa %xmm0, (%rsi)
337 ; AVX512-FCP-NEXT: vmovdqa %xmm3, (%rdx)
338 ; AVX512-FCP-NEXT: vmovdqa %xmm4, (%rcx)
339 ; AVX512-FCP-NEXT: vzeroupper
340 ; AVX512-FCP-NEXT: retq
342 ; AVX512DQ-LABEL: load_i32_stride3_vf4:
344 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,3,6,9]
345 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm1
346 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm2
347 ; AVX512DQ-NEXT: vpermi2d %ymm2, %ymm1, %ymm0
348 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,4,7,10]
349 ; AVX512DQ-NEXT: vpermi2d %ymm2, %ymm1, %ymm3
350 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,5,8,11]
351 ; AVX512DQ-NEXT: vpermi2d %ymm2, %ymm1, %ymm4
352 ; AVX512DQ-NEXT: vmovdqa %xmm0, (%rsi)
353 ; AVX512DQ-NEXT: vmovdqa %xmm3, (%rdx)
354 ; AVX512DQ-NEXT: vmovdqa %xmm4, (%rcx)
355 ; AVX512DQ-NEXT: vzeroupper
356 ; AVX512DQ-NEXT: retq
358 ; AVX512DQ-FCP-LABEL: load_i32_stride3_vf4:
359 ; AVX512DQ-FCP: # %bb.0:
360 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,3,6,9]
361 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm1
362 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm2
363 ; AVX512DQ-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm0
364 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,4,7,10]
365 ; AVX512DQ-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm3
366 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,5,8,11]
367 ; AVX512DQ-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm4
368 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, (%rsi)
369 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, (%rdx)
370 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, (%rcx)
371 ; AVX512DQ-FCP-NEXT: vzeroupper
372 ; AVX512DQ-FCP-NEXT: retq
374 ; AVX512BW-LABEL: load_i32_stride3_vf4:
376 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,3,6,9]
377 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1
378 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm2
379 ; AVX512BW-NEXT: vpermi2d %ymm2, %ymm1, %ymm0
380 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,4,7,10]
381 ; AVX512BW-NEXT: vpermi2d %ymm2, %ymm1, %ymm3
382 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,5,8,11]
383 ; AVX512BW-NEXT: vpermi2d %ymm2, %ymm1, %ymm4
384 ; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
385 ; AVX512BW-NEXT: vmovdqa %xmm3, (%rdx)
386 ; AVX512BW-NEXT: vmovdqa %xmm4, (%rcx)
387 ; AVX512BW-NEXT: vzeroupper
388 ; AVX512BW-NEXT: retq
390 ; AVX512BW-FCP-LABEL: load_i32_stride3_vf4:
391 ; AVX512BW-FCP: # %bb.0:
392 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,3,6,9]
393 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm1
394 ; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm2
395 ; AVX512BW-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm0
396 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,4,7,10]
397 ; AVX512BW-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm3
398 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,5,8,11]
399 ; AVX512BW-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm4
400 ; AVX512BW-FCP-NEXT: vmovdqa %xmm0, (%rsi)
401 ; AVX512BW-FCP-NEXT: vmovdqa %xmm3, (%rdx)
402 ; AVX512BW-FCP-NEXT: vmovdqa %xmm4, (%rcx)
403 ; AVX512BW-FCP-NEXT: vzeroupper
404 ; AVX512BW-FCP-NEXT: retq
406 ; AVX512DQ-BW-LABEL: load_i32_stride3_vf4:
407 ; AVX512DQ-BW: # %bb.0:
408 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,3,6,9]
409 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm1
410 ; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %ymm2
411 ; AVX512DQ-BW-NEXT: vpermi2d %ymm2, %ymm1, %ymm0
412 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,4,7,10]
413 ; AVX512DQ-BW-NEXT: vpermi2d %ymm2, %ymm1, %ymm3
414 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,5,8,11]
415 ; AVX512DQ-BW-NEXT: vpermi2d %ymm2, %ymm1, %ymm4
416 ; AVX512DQ-BW-NEXT: vmovdqa %xmm0, (%rsi)
417 ; AVX512DQ-BW-NEXT: vmovdqa %xmm3, (%rdx)
418 ; AVX512DQ-BW-NEXT: vmovdqa %xmm4, (%rcx)
419 ; AVX512DQ-BW-NEXT: vzeroupper
420 ; AVX512DQ-BW-NEXT: retq
422 ; AVX512DQ-BW-FCP-LABEL: load_i32_stride3_vf4:
423 ; AVX512DQ-BW-FCP: # %bb.0:
424 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,3,6,9]
425 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm1
426 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm2
427 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm0
428 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,4,7,10]
429 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm3
430 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,5,8,11]
431 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm4
432 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm0, (%rsi)
433 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm3, (%rdx)
434 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm4, (%rcx)
435 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
436 ; AVX512DQ-BW-FCP-NEXT: retq
437 %wide.vec = load <12 x i32>, ptr %in.vec, align 64
438 %strided.vec0 = shufflevector <12 x i32> %wide.vec, <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
439 %strided.vec1 = shufflevector <12 x i32> %wide.vec, <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
440 %strided.vec2 = shufflevector <12 x i32> %wide.vec, <12 x i32> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
441 store <4 x i32> %strided.vec0, ptr %out.vec0, align 64
442 store <4 x i32> %strided.vec1, ptr %out.vec1, align 64
443 store <4 x i32> %strided.vec2, ptr %out.vec2, align 64
447 define void @load_i32_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) nounwind {
448 ; SSE-LABEL: load_i32_stride3_vf8:
450 ; SSE-NEXT: movaps 80(%rdi), %xmm1
451 ; SSE-NEXT: movaps 64(%rdi), %xmm5
452 ; SSE-NEXT: movdqa (%rdi), %xmm0
453 ; SSE-NEXT: movaps 16(%rdi), %xmm7
454 ; SSE-NEXT: movaps 32(%rdi), %xmm4
455 ; SSE-NEXT: movdqa 48(%rdi), %xmm2
456 ; SSE-NEXT: movdqa %xmm0, %xmm3
457 ; SSE-NEXT: movaps %xmm7, %xmm8
458 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[2,3,2,3]
459 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm7[0,0]
460 ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm7[1,1,1,1]
461 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm4[1,0]
462 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm7[0,2]
463 ; SSE-NEXT: movdqa %xmm2, %xmm7
464 ; SSE-NEXT: movaps %xmm5, %xmm10
465 ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm2[2,3,2,3]
466 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm5[0,0]
467 ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm5[1,1,1,1]
468 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm1[1,0]
469 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,3],xmm5[0,2]
470 ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,1],xmm1[2,3]
471 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm10[0,2]
472 ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,1],xmm4[2,3]
473 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm8[0,2]
474 ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1]
475 ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm4[0,3]
476 ; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1]
477 ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm1[0,3]
478 ; SSE-NEXT: movaps %xmm7, 16(%rsi)
479 ; SSE-NEXT: movaps %xmm3, (%rsi)
480 ; SSE-NEXT: movaps %xmm2, 16(%rdx)
481 ; SSE-NEXT: movaps %xmm0, (%rdx)
482 ; SSE-NEXT: movaps %xmm11, 16(%rcx)
483 ; SSE-NEXT: movaps %xmm6, (%rcx)
486 ; AVX-LABEL: load_i32_stride3_vf8:
488 ; AVX-NEXT: vmovaps 64(%rdi), %ymm0
489 ; AVX-NEXT: vmovaps 32(%rdi), %ymm1
490 ; AVX-NEXT: vmovaps (%rdi), %ymm2
491 ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7]
492 ; AVX-NEXT: vmovaps 16(%rdi), %xmm4
493 ; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,1],ymm1[1,3],ymm4[6,5],ymm1[5,7]
494 ; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,3],ymm4[0,2],ymm3[4,7],ymm4[4,6]
495 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3,0,1]
496 ; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm0[1,0],ymm4[2,0],ymm0[5,4],ymm4[6,4]
497 ; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4]
498 ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7]
499 ; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm0[2,0],ymm4[3,0],ymm0[6,4],ymm4[7,4]
500 ; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm4[0,0],ymm5[2,0],ymm4[4,4],ymm5[6,4]
501 ; AVX-NEXT: vmovaps 16(%rdi), %xmm6
502 ; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
503 ; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,2],ymm6[0,3],ymm7[5,6],ymm6[4,7]
504 ; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,2,3,1,4,6,7,5]
505 ; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5,6,7]
506 ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
507 ; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm6[1,0],ymm2[2,0],ymm6[5,4],ymm2[6,4]
508 ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm1[0,3],ymm2[6,4],ymm1[4,7]
509 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm4[0,1],ymm0[0,3],ymm4[4,5],ymm0[4,7]
510 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
511 ; AVX-NEXT: vmovaps %ymm3, (%rsi)
512 ; AVX-NEXT: vmovaps %ymm5, (%rdx)
513 ; AVX-NEXT: vmovaps %ymm0, (%rcx)
514 ; AVX-NEXT: vzeroupper
517 ; AVX2-LABEL: load_i32_stride3_vf8:
519 ; AVX2-NEXT: vmovaps (%rdi), %ymm0
520 ; AVX2-NEXT: vmovaps 32(%rdi), %ymm1
521 ; AVX2-NEXT: vmovaps 64(%rdi), %ymm2
522 ; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
523 ; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
524 ; AVX2-NEXT: vmovaps {{.*#+}} ymm4 = [0,3,6,1,4,7,2,5]
525 ; AVX2-NEXT: vpermps %ymm3, %ymm4, %ymm3
526 ; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
527 ; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0],ymm4[1,2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7]
528 ; AVX2-NEXT: vmovaps {{.*#+}} ymm5 = [1,4,7,2,5,0,3,6]
529 ; AVX2-NEXT: vpermps %ymm4, %ymm5, %ymm4
530 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
531 ; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [2,5,0,3,6,u,u,u]
532 ; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
533 ; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1,0,3,4,5,4,7]
534 ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
535 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
536 ; AVX2-NEXT: vmovaps %ymm3, (%rsi)
537 ; AVX2-NEXT: vmovaps %ymm4, (%rdx)
538 ; AVX2-NEXT: vmovaps %ymm0, (%rcx)
539 ; AVX2-NEXT: vzeroupper
542 ; AVX2-FP-LABEL: load_i32_stride3_vf8:
544 ; AVX2-FP-NEXT: vmovaps (%rdi), %ymm0
545 ; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm1
546 ; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm2
547 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
548 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
549 ; AVX2-FP-NEXT: vmovaps {{.*#+}} ymm4 = [0,3,6,1,4,7,2,5]
550 ; AVX2-FP-NEXT: vpermps %ymm3, %ymm4, %ymm3
551 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
552 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0],ymm4[1,2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7]
553 ; AVX2-FP-NEXT: vmovaps {{.*#+}} ymm5 = [1,4,7,2,5,0,3,6]
554 ; AVX2-FP-NEXT: vpermps %ymm4, %ymm5, %ymm4
555 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
556 ; AVX2-FP-NEXT: vmovaps {{.*#+}} ymm1 = [2,5,0,3,6,u,u,u]
557 ; AVX2-FP-NEXT: vpermps %ymm0, %ymm1, %ymm0
558 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1,0,3,4,5,4,7]
559 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
560 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
561 ; AVX2-FP-NEXT: vmovaps %ymm3, (%rsi)
562 ; AVX2-FP-NEXT: vmovaps %ymm4, (%rdx)
563 ; AVX2-FP-NEXT: vmovaps %ymm0, (%rcx)
564 ; AVX2-FP-NEXT: vzeroupper
567 ; AVX2-FCP-LABEL: load_i32_stride3_vf8:
569 ; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm0
570 ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm1
571 ; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm2
572 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
573 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
574 ; AVX2-FCP-NEXT: vmovaps {{.*#+}} ymm4 = [0,3,6,1,4,7,2,5]
575 ; AVX2-FCP-NEXT: vpermps %ymm3, %ymm4, %ymm3
576 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
577 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0],ymm4[1,2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7]
578 ; AVX2-FCP-NEXT: vmovaps {{.*#+}} ymm5 = [1,4,7,2,5,0,3,6]
579 ; AVX2-FCP-NEXT: vpermps %ymm4, %ymm5, %ymm4
580 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
581 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7]
582 ; AVX2-FCP-NEXT: vmovaps {{.*#+}} ymm1 = [2,5,0,3,6,1,4,7]
583 ; AVX2-FCP-NEXT: vpermps %ymm0, %ymm1, %ymm0
584 ; AVX2-FCP-NEXT: vmovaps %ymm3, (%rsi)
585 ; AVX2-FCP-NEXT: vmovaps %ymm4, (%rdx)
586 ; AVX2-FCP-NEXT: vmovaps %ymm0, (%rcx)
587 ; AVX2-FCP-NEXT: vzeroupper
588 ; AVX2-FCP-NEXT: retq
590 ; AVX512-LABEL: load_i32_stride3_vf8:
592 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
593 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1
594 ; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,3,6,9,12,15,18,21]
595 ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
596 ; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,4,7,10,13,16,19,22]
597 ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
598 ; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,5,8,11,14,17,20,23]
599 ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm4
600 ; AVX512-NEXT: vmovdqa %ymm2, (%rsi)
601 ; AVX512-NEXT: vmovdqa %ymm3, (%rdx)
602 ; AVX512-NEXT: vmovdqa %ymm4, (%rcx)
603 ; AVX512-NEXT: vzeroupper
606 ; AVX512-FCP-LABEL: load_i32_stride3_vf8:
607 ; AVX512-FCP: # %bb.0:
608 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
609 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
610 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,3,6,9,12,15,18,21]
611 ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
612 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,4,7,10,13,16,19,22]
613 ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
614 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,5,8,11,14,17,20,23]
615 ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4
616 ; AVX512-FCP-NEXT: vmovdqa %ymm2, (%rsi)
617 ; AVX512-FCP-NEXT: vmovdqa %ymm3, (%rdx)
618 ; AVX512-FCP-NEXT: vmovdqa %ymm4, (%rcx)
619 ; AVX512-FCP-NEXT: vzeroupper
620 ; AVX512-FCP-NEXT: retq
622 ; AVX512DQ-LABEL: load_i32_stride3_vf8:
624 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0
625 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm1
626 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,3,6,9,12,15,18,21]
627 ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
628 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,4,7,10,13,16,19,22]
629 ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
630 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,5,8,11,14,17,20,23]
631 ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm4
632 ; AVX512DQ-NEXT: vmovdqa %ymm2, (%rsi)
633 ; AVX512DQ-NEXT: vmovdqa %ymm3, (%rdx)
634 ; AVX512DQ-NEXT: vmovdqa %ymm4, (%rcx)
635 ; AVX512DQ-NEXT: vzeroupper
636 ; AVX512DQ-NEXT: retq
638 ; AVX512DQ-FCP-LABEL: load_i32_stride3_vf8:
639 ; AVX512DQ-FCP: # %bb.0:
640 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
641 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
642 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,3,6,9,12,15,18,21]
643 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
644 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,4,7,10,13,16,19,22]
645 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
646 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,5,8,11,14,17,20,23]
647 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4
648 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, (%rsi)
649 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm3, (%rdx)
650 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, (%rcx)
651 ; AVX512DQ-FCP-NEXT: vzeroupper
652 ; AVX512DQ-FCP-NEXT: retq
654 ; AVX512BW-LABEL: load_i32_stride3_vf8:
656 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
657 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1
658 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,3,6,9,12,15,18,21]
659 ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
660 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,4,7,10,13,16,19,22]
661 ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
662 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,5,8,11,14,17,20,23]
663 ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm4
664 ; AVX512BW-NEXT: vmovdqa %ymm2, (%rsi)
665 ; AVX512BW-NEXT: vmovdqa %ymm3, (%rdx)
666 ; AVX512BW-NEXT: vmovdqa %ymm4, (%rcx)
667 ; AVX512BW-NEXT: vzeroupper
668 ; AVX512BW-NEXT: retq
670 ; AVX512BW-FCP-LABEL: load_i32_stride3_vf8:
671 ; AVX512BW-FCP: # %bb.0:
672 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
673 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
674 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,3,6,9,12,15,18,21]
675 ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
676 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,4,7,10,13,16,19,22]
677 ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
678 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,5,8,11,14,17,20,23]
679 ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4
680 ; AVX512BW-FCP-NEXT: vmovdqa %ymm2, (%rsi)
681 ; AVX512BW-FCP-NEXT: vmovdqa %ymm3, (%rdx)
682 ; AVX512BW-FCP-NEXT: vmovdqa %ymm4, (%rcx)
683 ; AVX512BW-FCP-NEXT: vzeroupper
684 ; AVX512BW-FCP-NEXT: retq
686 ; AVX512DQ-BW-LABEL: load_i32_stride3_vf8:
687 ; AVX512DQ-BW: # %bb.0:
688 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0
689 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1
690 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,3,6,9,12,15,18,21]
691 ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
692 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,4,7,10,13,16,19,22]
693 ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
694 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,5,8,11,14,17,20,23]
695 ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm4
696 ; AVX512DQ-BW-NEXT: vmovdqa %ymm2, (%rsi)
697 ; AVX512DQ-BW-NEXT: vmovdqa %ymm3, (%rdx)
698 ; AVX512DQ-BW-NEXT: vmovdqa %ymm4, (%rcx)
699 ; AVX512DQ-BW-NEXT: vzeroupper
700 ; AVX512DQ-BW-NEXT: retq
702 ; AVX512DQ-BW-FCP-LABEL: load_i32_stride3_vf8:
703 ; AVX512DQ-BW-FCP: # %bb.0:
704 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
705 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
706 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,3,6,9,12,15,18,21]
707 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
708 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,4,7,10,13,16,19,22]
709 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
710 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,5,8,11,14,17,20,23]
711 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4
712 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm2, (%rsi)
713 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm3, (%rdx)
714 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm4, (%rcx)
715 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
716 ; AVX512DQ-BW-FCP-NEXT: retq
717 %wide.vec = load <24 x i32>, ptr %in.vec, align 64
718 %strided.vec0 = shufflevector <24 x i32> %wide.vec, <24 x i32> poison, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
719 %strided.vec1 = shufflevector <24 x i32> %wide.vec, <24 x i32> poison, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
720 %strided.vec2 = shufflevector <24 x i32> %wide.vec, <24 x i32> poison, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
721 store <8 x i32> %strided.vec0, ptr %out.vec0, align 64
722 store <8 x i32> %strided.vec1, ptr %out.vec1, align 64
723 store <8 x i32> %strided.vec2, ptr %out.vec2, align 64
727 define void @load_i32_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) nounwind {
728 ; SSE-LABEL: load_i32_stride3_vf16:
730 ; SSE-NEXT: movaps 96(%rdi), %xmm6
731 ; SSE-NEXT: movaps 128(%rdi), %xmm12
732 ; SSE-NEXT: movaps 112(%rdi), %xmm13
733 ; SSE-NEXT: movaps 144(%rdi), %xmm11
734 ; SSE-NEXT: movaps 176(%rdi), %xmm10
735 ; SSE-NEXT: movaps 160(%rdi), %xmm9
736 ; SSE-NEXT: movaps (%rdi), %xmm7
737 ; SSE-NEXT: movaps 16(%rdi), %xmm8
738 ; SSE-NEXT: movaps 32(%rdi), %xmm3
739 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
740 ; SSE-NEXT: movaps 48(%rdi), %xmm15
741 ; SSE-NEXT: movaps 80(%rdi), %xmm14
742 ; SSE-NEXT: movaps 64(%rdi), %xmm2
743 ; SSE-NEXT: movaps %xmm2, %xmm0
744 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm14[1,0]
745 ; SSE-NEXT: movaps %xmm15, %xmm5
746 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm0[0,2]
747 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
748 ; SSE-NEXT: movaps %xmm8, %xmm0
749 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[1,0]
750 ; SSE-NEXT: movaps %xmm7, %xmm5
751 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm0[0,2]
752 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
753 ; SSE-NEXT: movaps %xmm9, %xmm0
754 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm10[1,0]
755 ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
756 ; SSE-NEXT: movaps %xmm11, %xmm3
757 ; SSE-NEXT: movaps %xmm11, %xmm4
758 ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
759 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm0[0,2]
760 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
761 ; SSE-NEXT: movaps %xmm13, %xmm0
762 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm12[1,0]
763 ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
764 ; SSE-NEXT: movaps %xmm6, %xmm5
765 ; SSE-NEXT: movaps %xmm6, %xmm3
766 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm0[0,2]
767 ; SSE-NEXT: movaps %xmm15, %xmm11
768 ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,0],xmm2[0,0]
769 ; SSE-NEXT: movaps %xmm2, %xmm0
770 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm14[2,3]
771 ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm0[0,2]
772 ; SSE-NEXT: movaps %xmm4, %xmm6
773 ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,0],xmm9[0,0]
774 ; SSE-NEXT: movaps %xmm9, %xmm0
775 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm10[2,3]
776 ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm0[0,2]
777 ; SSE-NEXT: movaps %xmm3, %xmm1
778 ; SSE-NEXT: movaps %xmm3, %xmm10
779 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm13[0,0]
780 ; SSE-NEXT: movaps %xmm13, %xmm0
781 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm12[2,3]
782 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2]
783 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3]
784 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm8[0,0]
785 ; SSE-NEXT: movaps %xmm8, %xmm12
786 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
787 ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[3,1],xmm4[2,3]
788 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm12[0,2]
789 ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm2[1,1,1,1]
790 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[2,3,2,3]
791 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1]
792 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm14[0,3]
793 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,1,1]
794 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
795 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[0,3]
796 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[1,1,1,1]
797 ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
798 ; SSE-NEXT: # xmm8 = mem[2,3,2,3]
799 ; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1]
800 ; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
801 ; SSE-NEXT: # xmm8 = xmm8[0,1],mem[0,3]
802 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[1,1,1,1]
803 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[2,3,2,3]
804 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
805 ; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
806 ; SSE-NEXT: # xmm4 = xmm4[0,1],mem[0,3]
807 ; SSE-NEXT: movaps %xmm5, 32(%rsi)
808 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
809 ; SSE-NEXT: movaps %xmm3, 48(%rsi)
810 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
811 ; SSE-NEXT: movaps %xmm3, (%rsi)
812 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
813 ; SSE-NEXT: movaps %xmm3, 16(%rsi)
814 ; SSE-NEXT: movaps %xmm1, 32(%rdx)
815 ; SSE-NEXT: movaps %xmm6, 48(%rdx)
816 ; SSE-NEXT: movaps %xmm7, (%rdx)
817 ; SSE-NEXT: movaps %xmm11, 16(%rdx)
818 ; SSE-NEXT: movaps %xmm4, 32(%rcx)
819 ; SSE-NEXT: movaps %xmm8, 48(%rcx)
820 ; SSE-NEXT: movaps %xmm0, (%rcx)
821 ; SSE-NEXT: movaps %xmm2, 16(%rcx)
824 ; AVX-LABEL: load_i32_stride3_vf16:
826 ; AVX-NEXT: vmovaps 160(%rdi), %ymm0
827 ; AVX-NEXT: vmovaps 128(%rdi), %ymm1
828 ; AVX-NEXT: vmovaps 96(%rdi), %ymm2
829 ; AVX-NEXT: vmovaps 64(%rdi), %ymm3
830 ; AVX-NEXT: vmovaps 32(%rdi), %ymm4
831 ; AVX-NEXT: vmovaps (%rdi), %ymm6
832 ; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5,6],ymm4[7]
833 ; AVX-NEXT: vmovaps 16(%rdi), %xmm7
834 ; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,1],ymm4[1,3],ymm7[6,5],ymm4[5,7]
835 ; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,3],ymm7[0,2],ymm5[4,7],ymm7[4,6]
836 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm3[2,3,0,1]
837 ; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm3[1,0],ymm7[2,0],ymm3[5,4],ymm7[6,4]
838 ; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4]
839 ; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm8[6,7]
840 ; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7]
841 ; AVX-NEXT: vmovaps 112(%rdi), %xmm9
842 ; AVX-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,1],ymm1[1,3],ymm9[6,5],ymm1[5,7]
843 ; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,3],ymm9[0,2],ymm8[4,7],ymm9[4,6]
844 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm0[2,3,0,1]
845 ; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm0[1,0],ymm9[2,0],ymm0[5,4],ymm9[6,4]
846 ; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4]
847 ; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm10[6,7]
848 ; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm3[2,0],ymm7[3,0],ymm3[6,4],ymm7[7,4]
849 ; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm7[0,0],ymm10[2,0],ymm7[4,4],ymm10[6,4]
850 ; AVX-NEXT: vmovaps 16(%rdi), %xmm11
851 ; AVX-NEXT: vblendps {{.*#+}} ymm12 = ymm6[0,1],ymm4[2],ymm6[3,4],ymm4[5],ymm6[6,7]
852 ; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm12[1,2],ymm11[0,3],ymm12[5,6],ymm11[4,7]
853 ; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,2,3,1,4,6,7,5]
854 ; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1,2,3,4],ymm10[5,6,7]
855 ; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm0[2,0],ymm9[3,0],ymm0[6,4],ymm9[7,4]
856 ; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm9[0,0],ymm12[2,0],ymm9[4,4],ymm12[6,4]
857 ; AVX-NEXT: vmovaps 112(%rdi), %xmm13
858 ; AVX-NEXT: vblendps {{.*#+}} ymm14 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
859 ; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm14[1,2],ymm13[0,3],ymm14[5,6],ymm13[4,7]
860 ; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,2,3,1,4,6,7,5]
861 ; AVX-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm12[5,6,7]
862 ; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm4[0,1],ymm6[2],ymm4[3,4],ymm6[5],ymm4[6,7]
863 ; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm11[1,0],ymm6[2,0],ymm11[5,4],ymm6[6,4]
864 ; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm6[2,0],ymm4[0,3],ymm6[6,4],ymm4[4,7]
865 ; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm7[0,1],ymm3[0,3],ymm7[4,5],ymm3[4,7]
866 ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6,7]
867 ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
868 ; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm13[1,0],ymm2[2,0],ymm13[5,4],ymm2[6,4]
869 ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm1[0,3],ymm2[6,4],ymm1[4,7]
870 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm9[0,1],ymm0[0,3],ymm9[4,5],ymm0[4,7]
871 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
872 ; AVX-NEXT: vmovaps %ymm8, 32(%rsi)
873 ; AVX-NEXT: vmovaps %ymm5, (%rsi)
874 ; AVX-NEXT: vmovaps %ymm12, 32(%rdx)
875 ; AVX-NEXT: vmovaps %ymm10, (%rdx)
876 ; AVX-NEXT: vmovaps %ymm0, 32(%rcx)
877 ; AVX-NEXT: vmovaps %ymm3, (%rcx)
878 ; AVX-NEXT: vzeroupper
881 ; AVX2-LABEL: load_i32_stride3_vf16:
883 ; AVX2-NEXT: vmovaps 160(%rdi), %ymm0
884 ; AVX2-NEXT: vmovaps 128(%rdi), %ymm1
885 ; AVX2-NEXT: vmovaps (%rdi), %ymm2
886 ; AVX2-NEXT: vmovaps 32(%rdi), %ymm3
887 ; AVX2-NEXT: vmovaps 64(%rdi), %ymm4
888 ; AVX2-NEXT: vmovaps 96(%rdi), %ymm5
889 ; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7]
890 ; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm4[2],ymm6[3,4],ymm4[5],ymm6[6,7]
891 ; AVX2-NEXT: vmovaps {{.*#+}} ymm7 = [0,3,6,1,4,7,2,5]
892 ; AVX2-NEXT: vpermps %ymm6, %ymm7, %ymm6
893 ; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm5[0],ymm1[1],ymm5[2,3],ymm1[4],ymm5[5,6],ymm1[7]
894 ; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm0[2],ymm8[3,4],ymm0[5],ymm8[6,7]
895 ; AVX2-NEXT: vpermps %ymm8, %ymm7, %ymm7
896 ; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
897 ; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm4[0],ymm8[1,2],ymm4[3],ymm8[4,5],ymm4[6],ymm8[7]
898 ; AVX2-NEXT: vmovaps {{.*#+}} ymm9 = [1,4,7,2,5,0,3,6]
899 ; AVX2-NEXT: vpermps %ymm8, %ymm9, %ymm8
900 ; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm5[0,1],ymm1[2],ymm5[3,4],ymm1[5],ymm5[6,7]
901 ; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0],ymm10[1,2],ymm0[3],ymm10[4,5],ymm0[6],ymm10[7]
902 ; AVX2-NEXT: vpermps %ymm10, %ymm9, %ymm9
903 ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
904 ; AVX2-NEXT: vmovaps {{.*#+}} ymm3 = [2,5,0,3,6,u,u,u]
905 ; AVX2-NEXT: vpermps %ymm2, %ymm3, %ymm2
906 ; AVX2-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,0,3,4,5,4,7]
907 ; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3]
908 ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7]
909 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm5[2],ymm1[3,4],ymm5[5],ymm1[6,7]
910 ; AVX2-NEXT: vpermps %ymm1, %ymm3, %ymm1
911 ; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,3,4,5,4,7]
912 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
913 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
914 ; AVX2-NEXT: vmovaps %ymm7, 32(%rsi)
915 ; AVX2-NEXT: vmovaps %ymm6, (%rsi)
916 ; AVX2-NEXT: vmovaps %ymm9, 32(%rdx)
917 ; AVX2-NEXT: vmovaps %ymm8, (%rdx)
918 ; AVX2-NEXT: vmovaps %ymm0, 32(%rcx)
919 ; AVX2-NEXT: vmovaps %ymm2, (%rcx)
920 ; AVX2-NEXT: vzeroupper
923 ; AVX2-FP-LABEL: load_i32_stride3_vf16:
925 ; AVX2-FP-NEXT: vmovaps 160(%rdi), %ymm0
926 ; AVX2-FP-NEXT: vmovaps 128(%rdi), %ymm1
927 ; AVX2-FP-NEXT: vmovaps (%rdi), %ymm2
928 ; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm3
929 ; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm4
930 ; AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm5
931 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7]
932 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm4[2],ymm6[3,4],ymm4[5],ymm6[6,7]
933 ; AVX2-FP-NEXT: vmovaps {{.*#+}} ymm7 = [0,3,6,1,4,7,2,5]
934 ; AVX2-FP-NEXT: vpermps %ymm6, %ymm7, %ymm6
935 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm5[0],ymm1[1],ymm5[2,3],ymm1[4],ymm5[5,6],ymm1[7]
936 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm0[2],ymm8[3,4],ymm0[5],ymm8[6,7]
937 ; AVX2-FP-NEXT: vpermps %ymm8, %ymm7, %ymm7
938 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
939 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm4[0],ymm8[1,2],ymm4[3],ymm8[4,5],ymm4[6],ymm8[7]
940 ; AVX2-FP-NEXT: vmovaps {{.*#+}} ymm9 = [1,4,7,2,5,0,3,6]
941 ; AVX2-FP-NEXT: vpermps %ymm8, %ymm9, %ymm8
942 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm5[0,1],ymm1[2],ymm5[3,4],ymm1[5],ymm5[6,7]
943 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0],ymm10[1,2],ymm0[3],ymm10[4,5],ymm0[6],ymm10[7]
944 ; AVX2-FP-NEXT: vpermps %ymm10, %ymm9, %ymm9
945 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
946 ; AVX2-FP-NEXT: vmovaps {{.*#+}} ymm3 = [2,5,0,3,6,u,u,u]
947 ; AVX2-FP-NEXT: vpermps %ymm2, %ymm3, %ymm2
948 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,0,3,4,5,4,7]
949 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3]
950 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7]
951 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm5[2],ymm1[3,4],ymm5[5],ymm1[6,7]
952 ; AVX2-FP-NEXT: vpermps %ymm1, %ymm3, %ymm1
953 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,3,4,5,4,7]
954 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
955 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
956 ; AVX2-FP-NEXT: vmovaps %ymm7, 32(%rsi)
957 ; AVX2-FP-NEXT: vmovaps %ymm6, (%rsi)
958 ; AVX2-FP-NEXT: vmovaps %ymm9, 32(%rdx)
959 ; AVX2-FP-NEXT: vmovaps %ymm8, (%rdx)
960 ; AVX2-FP-NEXT: vmovaps %ymm0, 32(%rcx)
961 ; AVX2-FP-NEXT: vmovaps %ymm2, (%rcx)
962 ; AVX2-FP-NEXT: vzeroupper
965 ; AVX2-FCP-LABEL: load_i32_stride3_vf16:
967 ; AVX2-FCP-NEXT: vmovaps 160(%rdi), %ymm0
968 ; AVX2-FCP-NEXT: vmovaps 128(%rdi), %ymm1
969 ; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm2
970 ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm3
971 ; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm4
972 ; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm5
973 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7]
974 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm4[2],ymm6[3,4],ymm4[5],ymm6[6,7]
975 ; AVX2-FCP-NEXT: vmovaps {{.*#+}} ymm7 = [0,3,6,1,4,7,2,5]
976 ; AVX2-FCP-NEXT: vpermps %ymm6, %ymm7, %ymm6
977 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm5[0],ymm1[1],ymm5[2,3],ymm1[4],ymm5[5,6],ymm1[7]
978 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm0[2],ymm8[3,4],ymm0[5],ymm8[6,7]
979 ; AVX2-FCP-NEXT: vpermps %ymm8, %ymm7, %ymm7
980 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
981 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm4[0],ymm8[1,2],ymm4[3],ymm8[4,5],ymm4[6],ymm8[7]
982 ; AVX2-FCP-NEXT: vmovaps {{.*#+}} ymm9 = [1,4,7,2,5,0,3,6]
983 ; AVX2-FCP-NEXT: vpermps %ymm8, %ymm9, %ymm8
984 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm5[0,1],ymm1[2],ymm5[3,4],ymm1[5],ymm5[6,7]
985 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0],ymm10[1,2],ymm0[3],ymm10[4,5],ymm0[6],ymm10[7]
986 ; AVX2-FCP-NEXT: vpermps %ymm10, %ymm9, %ymm9
987 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
988 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6],ymm4[7]
989 ; AVX2-FCP-NEXT: vmovaps {{.*#+}} ymm3 = [2,5,0,3,6,1,4,7]
990 ; AVX2-FCP-NEXT: vpermps %ymm2, %ymm3, %ymm2
991 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm5[2],ymm1[3,4],ymm5[5],ymm1[6,7]
992 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
993 ; AVX2-FCP-NEXT: vpermps %ymm0, %ymm3, %ymm0
994 ; AVX2-FCP-NEXT: vmovaps %ymm7, 32(%rsi)
995 ; AVX2-FCP-NEXT: vmovaps %ymm6, (%rsi)
996 ; AVX2-FCP-NEXT: vmovaps %ymm9, 32(%rdx)
997 ; AVX2-FCP-NEXT: vmovaps %ymm8, (%rdx)
998 ; AVX2-FCP-NEXT: vmovaps %ymm0, 32(%rcx)
999 ; AVX2-FCP-NEXT: vmovaps %ymm2, (%rcx)
1000 ; AVX2-FCP-NEXT: vzeroupper
1001 ; AVX2-FCP-NEXT: retq
1003 ; AVX512-LABEL: load_i32_stride3_vf16:
1005 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
1006 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1
1007 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm2
1008 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0]
1009 ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
1010 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29]
1011 ; AVX512-NEXT: vpermi2d %zmm2, %zmm3, %zmm4
1012 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm3 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0]
1013 ; AVX512-NEXT: vpermi2d %zmm0, %zmm1, %zmm3
1014 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30]
1015 ; AVX512-NEXT: vpermi2d %zmm2, %zmm3, %zmm5
1016 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm3 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0]
1017 ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
1018 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31]
1019 ; AVX512-NEXT: vpermi2d %zmm2, %zmm3, %zmm0
1020 ; AVX512-NEXT: vmovdqa64 %zmm4, (%rsi)
1021 ; AVX512-NEXT: vmovdqa64 %zmm5, (%rdx)
1022 ; AVX512-NEXT: vmovdqa64 %zmm0, (%rcx)
1023 ; AVX512-NEXT: vzeroupper
1026 ; AVX512-FCP-LABEL: load_i32_stride3_vf16:
1027 ; AVX512-FCP: # %bb.0:
1028 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
1029 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
1030 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2
1031 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0]
1032 ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
1033 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29]
1034 ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm4
1035 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0]
1036 ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm3
1037 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30]
1038 ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm5
1039 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0]
1040 ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
1041 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31]
1042 ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm0
1043 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%rsi)
1044 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%rdx)
1045 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%rcx)
1046 ; AVX512-FCP-NEXT: vzeroupper
1047 ; AVX512-FCP-NEXT: retq
1049 ; AVX512DQ-LABEL: load_i32_stride3_vf16:
1050 ; AVX512DQ: # %bb.0:
1051 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0
1052 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm1
1053 ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm2
1054 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0]
1055 ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
1056 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29]
1057 ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm3, %zmm4
1058 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0]
1059 ; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm1, %zmm3
1060 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30]
1061 ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm3, %zmm5
1062 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0]
1063 ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
1064 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31]
1065 ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm3, %zmm0
1066 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, (%rsi)
1067 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, (%rdx)
1068 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rcx)
1069 ; AVX512DQ-NEXT: vzeroupper
1070 ; AVX512DQ-NEXT: retq
1072 ; AVX512DQ-FCP-LABEL: load_i32_stride3_vf16:
1073 ; AVX512DQ-FCP: # %bb.0:
1074 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
1075 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
1076 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2
1077 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0]
1078 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
1079 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29]
1080 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm4
1081 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0]
1082 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm3
1083 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30]
1084 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm5
1085 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0]
1086 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
1087 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31]
1088 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm0
1089 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, (%rsi)
1090 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, (%rdx)
1091 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%rcx)
1092 ; AVX512DQ-FCP-NEXT: vzeroupper
1093 ; AVX512DQ-FCP-NEXT: retq
1095 ; AVX512BW-LABEL: load_i32_stride3_vf16:
1096 ; AVX512BW: # %bb.0:
1097 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
1098 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1
1099 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm2
1100 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0]
1101 ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
1102 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29]
1103 ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm4
1104 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0]
1105 ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm3
1106 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30]
1107 ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm5
1108 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0]
1109 ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
1110 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31]
1111 ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm0
1112 ; AVX512BW-NEXT: vmovdqa64 %zmm4, (%rsi)
1113 ; AVX512BW-NEXT: vmovdqa64 %zmm5, (%rdx)
1114 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
1115 ; AVX512BW-NEXT: vzeroupper
1116 ; AVX512BW-NEXT: retq
1118 ; AVX512BW-FCP-LABEL: load_i32_stride3_vf16:
1119 ; AVX512BW-FCP: # %bb.0:
1120 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
1121 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
1122 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2
1123 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0]
1124 ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
1125 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29]
1126 ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm4
1127 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0]
1128 ; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm3
1129 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30]
1130 ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm5
1131 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0]
1132 ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
1133 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31]
1134 ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm0
1135 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, (%rsi)
1136 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, (%rdx)
1137 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%rcx)
1138 ; AVX512BW-FCP-NEXT: vzeroupper
1139 ; AVX512BW-FCP-NEXT: retq
1141 ; AVX512DQ-BW-LABEL: load_i32_stride3_vf16:
1142 ; AVX512DQ-BW: # %bb.0:
1143 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0
1144 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1
1145 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm2
1146 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0]
1147 ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
1148 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29]
1149 ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm4
1150 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0]
1151 ; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm3
1152 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30]
1153 ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm5
1154 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0]
1155 ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
1156 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31]
1157 ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm0
1158 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, (%rsi)
1159 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, (%rdx)
1160 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%rcx)
1161 ; AVX512DQ-BW-NEXT: vzeroupper
1162 ; AVX512DQ-BW-NEXT: retq
1164 ; AVX512DQ-BW-FCP-LABEL: load_i32_stride3_vf16:
1165 ; AVX512DQ-BW-FCP: # %bb.0:
1166 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
1167 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
1168 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2
1169 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0]
1170 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
1171 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29]
1172 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm4
1173 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0]
1174 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm3
1175 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30]
1176 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm5
1177 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0]
1178 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
1179 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31]
1180 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm0
1181 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, (%rsi)
1182 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, (%rdx)
1183 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%rcx)
1184 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
1185 ; AVX512DQ-BW-FCP-NEXT: retq
1186 %wide.vec = load <48 x i32>, ptr %in.vec, align 64
1187 %strided.vec0 = shufflevector <48 x i32> %wide.vec, <48 x i32> poison, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
1188 %strided.vec1 = shufflevector <48 x i32> %wide.vec, <48 x i32> poison, <16 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46>
1189 %strided.vec2 = shufflevector <48 x i32> %wide.vec, <48 x i32> poison, <16 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47>
1190 store <16 x i32> %strided.vec0, ptr %out.vec0, align 64
1191 store <16 x i32> %strided.vec1, ptr %out.vec1, align 64
1192 store <16 x i32> %strided.vec2, ptr %out.vec2, align 64
1196 define void @load_i32_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) nounwind {
1197 ; SSE-LABEL: load_i32_stride3_vf32:
1199 ; SSE-NEXT: subq $392, %rsp # imm = 0x188
1200 ; SSE-NEXT: movaps 192(%rdi), %xmm4
1201 ; SSE-NEXT: movaps 224(%rdi), %xmm3
1202 ; SSE-NEXT: movaps 208(%rdi), %xmm14
1203 ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1204 ; SSE-NEXT: movaps 240(%rdi), %xmm7
1205 ; SSE-NEXT: movaps 272(%rdi), %xmm10
1206 ; SSE-NEXT: movaps 256(%rdi), %xmm9
1207 ; SSE-NEXT: movaps (%rdi), %xmm13
1208 ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1209 ; SSE-NEXT: movaps 16(%rdi), %xmm8
1210 ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1211 ; SSE-NEXT: movaps 32(%rdi), %xmm11
1212 ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1213 ; SSE-NEXT: movaps 48(%rdi), %xmm2
1214 ; SSE-NEXT: movaps 80(%rdi), %xmm12
1215 ; SSE-NEXT: movaps 64(%rdi), %xmm5
1216 ; SSE-NEXT: movaps %xmm5, %xmm0
1217 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm12[1,0]
1218 ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1219 ; SSE-NEXT: movaps %xmm2, %xmm1
1220 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1221 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2]
1222 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1223 ; SSE-NEXT: movaps %xmm9, %xmm0
1224 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm10[1,0]
1225 ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1226 ; SSE-NEXT: movaps %xmm7, %xmm1
1227 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1228 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2]
1229 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1230 ; SSE-NEXT: movaps %xmm8, %xmm0
1231 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm11[1,0]
1232 ; SSE-NEXT: movaps %xmm13, %xmm1
1233 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2]
1234 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1235 ; SSE-NEXT: movaps %xmm14, %xmm0
1236 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[1,0]
1237 ; SSE-NEXT: movaps %xmm3, %xmm13
1238 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1239 ; SSE-NEXT: movaps %xmm4, %xmm1
1240 ; SSE-NEXT: movaps %xmm4, %xmm11
1241 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1242 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2]
1243 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1244 ; SSE-NEXT: movaps 176(%rdi), %xmm1
1245 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1246 ; SSE-NEXT: movaps 160(%rdi), %xmm0
1247 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1248 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0]
1249 ; SSE-NEXT: movaps 144(%rdi), %xmm3
1250 ; SSE-NEXT: movaps %xmm3, %xmm1
1251 ; SSE-NEXT: movaps %xmm3, (%rsp) # 16-byte Spill
1252 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2]
1253 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1254 ; SSE-NEXT: movaps 368(%rdi), %xmm1
1255 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1256 ; SSE-NEXT: movaps 352(%rdi), %xmm0
1257 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1258 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0]
1259 ; SSE-NEXT: movaps 336(%rdi), %xmm4
1260 ; SSE-NEXT: movaps %xmm4, %xmm1
1261 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1262 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2]
1263 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1264 ; SSE-NEXT: movaps 128(%rdi), %xmm1
1265 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1266 ; SSE-NEXT: movaps 112(%rdi), %xmm15
1267 ; SSE-NEXT: movaps %xmm15, %xmm0
1268 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0]
1269 ; SSE-NEXT: movaps 96(%rdi), %xmm1
1270 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1271 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2]
1272 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1273 ; SSE-NEXT: movaps 320(%rdi), %xmm1
1274 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1275 ; SSE-NEXT: movaps 304(%rdi), %xmm6
1276 ; SSE-NEXT: movaps %xmm6, %xmm0
1277 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0]
1278 ; SSE-NEXT: movaps 288(%rdi), %xmm8
1279 ; SSE-NEXT: movaps %xmm8, %xmm1
1280 ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1281 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2]
1282 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1283 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1284 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm5[0,0]
1285 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,1],xmm12[2,3]
1286 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm5[0,2]
1287 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1288 ; SSE-NEXT: movaps %xmm7, %xmm14
1289 ; SSE-NEXT: movaps %xmm9, %xmm0
1290 ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1291 ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,0],xmm9[0,0]
1292 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm10[2,3]
1293 ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2],xmm0[0,2]
1294 ; SSE-NEXT: movaps %xmm11, %xmm10
1295 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1296 ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,0],xmm0[0,0]
1297 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm13[2,3]
1298 ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm0[0,2]
1299 ; SSE-NEXT: movaps %xmm3, %xmm9
1300 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1301 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,0],xmm0[0,0]
1302 ; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1303 ; SSE-NEXT: # xmm0 = xmm0[3,1],mem[2,3]
1304 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm0[0,2]
1305 ; SSE-NEXT: movaps %xmm4, %xmm7
1306 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1307 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm0[0,0]
1308 ; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1309 ; SSE-NEXT: # xmm0 = xmm0[3,1],mem[2,3]
1310 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm0[0,2]
1311 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
1312 ; SSE-NEXT: movaps %xmm4, %xmm11
1313 ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,0],xmm15[0,0]
1314 ; SSE-NEXT: movaps %xmm15, %xmm0
1315 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
1316 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm12[2,3]
1317 ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm0[0,2]
1318 ; SSE-NEXT: movaps %xmm8, %xmm3
1319 ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1320 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm6[0,0]
1321 ; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
1322 ; SSE-NEXT: # xmm6 = xmm6[3,1],mem[2,3]
1323 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm6[0,2]
1324 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1325 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[2,3,2,3]
1326 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1327 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[0,0]
1328 ; SSE-NEXT: movaps %xmm0, %xmm13
1329 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1330 ; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[3,1],xmm1[2,3]
1331 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm13[0,2]
1332 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1333 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
1334 ; SSE-NEXT: # xmm13 = mem[1,1,1,1]
1335 ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
1336 ; SSE-NEXT: # xmm8 = mem[2,3,2,3]
1337 ; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm13[0],xmm8[1],xmm13[1]
1338 ; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
1339 ; SSE-NEXT: # xmm8 = xmm8[0,1],mem[0,3]
1340 ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm0[1,1,1,1]
1341 ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1]
1342 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm1[0,3]
1343 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
1344 ; SSE-NEXT: # xmm13 = mem[1,1,1,1]
1345 ; SSE-NEXT: pshufd $238, (%rsp), %xmm6 # 16-byte Folded Reload
1346 ; SSE-NEXT: # xmm6 = mem[2,3,2,3]
1347 ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm13[0],xmm6[1],xmm13[1]
1348 ; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
1349 ; SSE-NEXT: # xmm6 = xmm6[0,1],mem[0,3]
1350 ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm15[1,1,1,1]
1351 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
1352 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1]
1353 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm12[0,3]
1354 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
1355 ; SSE-NEXT: # xmm15 = mem[1,1,1,1]
1356 ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
1357 ; SSE-NEXT: # xmm13 = mem[2,3,2,3]
1358 ; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1]
1359 ; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
1360 ; SSE-NEXT: # xmm13 = xmm13[0,1],mem[0,3]
1361 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1362 ; SSE-NEXT: # xmm0 = mem[1,1,1,1]
1363 ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
1364 ; SSE-NEXT: # xmm15 = mem[2,3,2,3]
1365 ; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1]
1366 ; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
1367 ; SSE-NEXT: # xmm15 = xmm15[0,1],mem[0,3]
1368 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1369 ; SSE-NEXT: # xmm0 = mem[1,1,1,1]
1370 ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
1371 ; SSE-NEXT: # xmm1 = mem[2,3,2,3]
1372 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1373 ; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
1374 ; SSE-NEXT: # xmm1 = xmm1[0,1],mem[0,3]
1375 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
1376 ; SSE-NEXT: # xmm2 = mem[1,1,1,1]
1377 ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1378 ; SSE-NEXT: # xmm0 = mem[2,3,2,3]
1379 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
1380 ; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1381 ; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,3]
1382 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1383 ; SSE-NEXT: movaps %xmm2, 96(%rsi)
1384 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1385 ; SSE-NEXT: movaps %xmm2, 32(%rsi)
1386 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1387 ; SSE-NEXT: movaps %xmm2, 112(%rsi)
1388 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1389 ; SSE-NEXT: movaps %xmm2, 48(%rsi)
1390 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1391 ; SSE-NEXT: movaps %xmm2, 64(%rsi)
1392 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1393 ; SSE-NEXT: movaps %xmm2, (%rsi)
1394 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1395 ; SSE-NEXT: movaps %xmm2, 80(%rsi)
1396 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1397 ; SSE-NEXT: movaps %xmm2, 16(%rsi)
1398 ; SSE-NEXT: movaps %xmm3, 96(%rdx)
1399 ; SSE-NEXT: movaps %xmm11, 32(%rdx)
1400 ; SSE-NEXT: movaps %xmm7, 112(%rdx)
1401 ; SSE-NEXT: movaps %xmm9, 48(%rdx)
1402 ; SSE-NEXT: movaps %xmm10, 64(%rdx)
1403 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1404 ; SSE-NEXT: movaps %xmm2, (%rdx)
1405 ; SSE-NEXT: movaps %xmm14, 80(%rdx)
1406 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1407 ; SSE-NEXT: movaps %xmm2, 16(%rdx)
1408 ; SSE-NEXT: movaps %xmm0, 96(%rcx)
1409 ; SSE-NEXT: movaps %xmm1, 112(%rcx)
1410 ; SSE-NEXT: movaps %xmm15, 64(%rcx)
1411 ; SSE-NEXT: movaps %xmm13, 80(%rcx)
1412 ; SSE-NEXT: movaps %xmm4, 32(%rcx)
1413 ; SSE-NEXT: movaps %xmm6, 48(%rcx)
1414 ; SSE-NEXT: movaps %xmm5, (%rcx)
1415 ; SSE-NEXT: movaps %xmm8, 16(%rcx)
1416 ; SSE-NEXT: addq $392, %rsp # imm = 0x188
1419 ; AVX-LABEL: load_i32_stride3_vf32:
1421 ; AVX-NEXT: subq $392, %rsp # imm = 0x188
1422 ; AVX-NEXT: vmovaps 256(%rdi), %ymm2
1423 ; AVX-NEXT: vmovaps 224(%rdi), %ymm7
1424 ; AVX-NEXT: vmovaps 192(%rdi), %ymm3
1425 ; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1426 ; AVX-NEXT: vmovaps 352(%rdi), %ymm4
1427 ; AVX-NEXT: vmovaps 320(%rdi), %ymm5
1428 ; AVX-NEXT: vmovaps 288(%rdi), %ymm6
1429 ; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1430 ; AVX-NEXT: vmovaps 160(%rdi), %ymm10
1431 ; AVX-NEXT: vmovaps 128(%rdi), %ymm9
1432 ; AVX-NEXT: vmovaps 96(%rdi), %ymm0
1433 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1434 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm9[1],ymm0[2,3],ymm9[4],ymm0[5,6],ymm9[7]
1435 ; AVX-NEXT: vmovaps 112(%rdi), %xmm1
1436 ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1],ymm9[1,3],ymm1[6,5],ymm9[5,7]
1437 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm1[0,2],ymm0[4,7],ymm1[4,6]
1438 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm10[2,3,0,1]
1439 ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm10[1,0],ymm11[2,0],ymm10[5,4],ymm11[6,4]
1440 ; AVX-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1441 ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
1442 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
1443 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1444 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6],ymm5[7]
1445 ; AVX-NEXT: vmovaps 304(%rdi), %xmm1
1446 ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1],ymm5[1,3],ymm1[6,5],ymm5[5,7]
1447 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm1[0,2],ymm0[4,7],ymm1[4,6]
1448 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm4[2,3,0,1]
1449 ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,0],ymm14[2,0],ymm4[5,4],ymm14[6,4]
1450 ; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1451 ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
1452 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
1453 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1454 ; AVX-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1455 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm7[1],ymm3[2,3],ymm7[4],ymm3[5,6],ymm7[7]
1456 ; AVX-NEXT: vmovaps 208(%rdi), %xmm1
1457 ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1],ymm7[1,3],ymm1[6,5],ymm7[5,7]
1458 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm1[0,2],ymm0[4,7],ymm1[4,6]
1459 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1460 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm2[2,3,0,1]
1461 ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,0],ymm12[2,0],ymm2[5,4],ymm12[6,4]
1462 ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
1463 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
1464 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1465 ; AVX-NEXT: vmovaps 32(%rdi), %ymm15
1466 ; AVX-NEXT: vmovaps 16(%rdi), %xmm0
1467 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm15[1,3],ymm0[6,5],ymm15[5,7]
1468 ; AVX-NEXT: vmovaps (%rdi), %ymm2
1469 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm15[1],ymm2[2,3],ymm15[4],ymm2[5,6],ymm15[7]
1470 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1471 ; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm1[0,3],ymm0[0,2],ymm1[4,7],ymm0[4,6]
1472 ; AVX-NEXT: vmovaps 64(%rdi), %ymm7
1473 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3,0,1]
1474 ; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm7[1,0],ymm0[2,0],ymm7[5,4],ymm0[6,4]
1475 ; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4]
1476 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm8[6,7]
1477 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1478 ; AVX-NEXT: vmovups %ymm11, (%rsp) # 32-byte Spill
1479 ; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm10[2,0],ymm11[3,0],ymm10[6,4],ymm11[7,4]
1480 ; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm11[0,0],ymm8[2,0],ymm11[4,4],ymm8[6,4]
1481 ; AVX-NEXT: vmovaps 112(%rdi), %xmm13
1482 ; AVX-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm6 # 32-byte Folded Reload
1483 ; AVX-NEXT: # ymm6 = mem[0,1],ymm9[2],mem[3,4],ymm9[5],mem[6,7]
1484 ; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,2],ymm13[0,3],ymm6[5,6],ymm13[4,7]
1485 ; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,2,3,1,4,6,7,5]
1486 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3,4],ymm8[5,6,7]
1487 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1488 ; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm4[2,0],ymm14[3,0],ymm4[6,4],ymm14[7,4]
1489 ; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm14[0,0],ymm6[2,0],ymm14[4,4],ymm6[6,4]
1490 ; AVX-NEXT: vmovaps 304(%rdi), %xmm8
1491 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
1492 ; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7]
1493 ; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm10[1,2],ymm8[0,3],ymm10[5,6],ymm8[4,7]
1494 ; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,2,3,1,4,6,7,5]
1495 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1,2,3,4],ymm6[5,6,7]
1496 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1497 ; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm7[2,0],ymm0[3,0],ymm7[6,4],ymm0[7,4]
1498 ; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm0[0,0],ymm6[2,0],ymm0[4,4],ymm6[6,4]
1499 ; AVX-NEXT: vmovaps 16(%rdi), %xmm11
1500 ; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1],ymm15[2],ymm2[3,4],ymm15[5],ymm2[6,7]
1501 ; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm4[1,2],ymm11[0,3],ymm4[5,6],ymm11[4,7]
1502 ; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,2,3,1,4,6,7,5]
1503 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm6[5,6,7]
1504 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1505 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
1506 ; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm1[2,0],ymm12[3,0],ymm1[6,4],ymm12[7,4]
1507 ; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm12[0,0],ymm4[2,0],ymm12[4,4],ymm4[6,4]
1508 ; AVX-NEXT: vmovaps 208(%rdi), %xmm10
1509 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
1510 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
1511 ; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
1512 ; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,2],ymm10[0,3],ymm6[5,6],ymm10[4,7]
1513 ; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,2,3,1,4,6,7,5]
1514 ; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3,4],ymm4[5,6,7]
1515 ; AVX-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm6 # 32-byte Folded Reload
1516 ; AVX-NEXT: # ymm6 = ymm9[0,1],mem[2],ymm9[3,4],mem[5],ymm9[6,7]
1517 ; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm13[1,0],ymm6[2,0],ymm13[5,4],ymm6[6,4]
1518 ; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,0],ymm9[0,3],ymm6[6,4],ymm9[4,7]
1519 ; AVX-NEXT: vmovups (%rsp), %ymm9 # 32-byte Reload
1520 ; AVX-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
1521 ; AVX-NEXT: # ymm9 = ymm9[0,1],mem[0,3],ymm9[4,5],mem[4,7]
1522 ; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm9[5,6,7]
1523 ; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7]
1524 ; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm8[1,0],ymm9[2,0],ymm8[5,4],ymm9[6,4]
1525 ; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm8[2,0],ymm5[0,3],ymm8[6,4],ymm5[4,7]
1526 ; AVX-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm8 # 32-byte Folded Reload
1527 ; AVX-NEXT: # ymm8 = ymm14[0,1],mem[0,3],ymm14[4,5],mem[4,7]
1528 ; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm8[5,6,7]
1529 ; AVX-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload
1530 ; AVX-NEXT: # ymm3 = ymm15[0,1],mem[2],ymm15[3,4],mem[5],ymm15[6,7]
1531 ; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm11[1,0],ymm3[2,0],ymm11[5,4],ymm3[6,4]
1532 ; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0],ymm15[0,3],ymm3[6,4],ymm15[4,7]
1533 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm7[0,3],ymm0[4,5],ymm7[4,7]
1534 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7]
1535 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
1536 ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm10[1,0],ymm1[2,0],ymm10[5,4],ymm1[6,4]
1537 ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm2[0,3],ymm1[6,4],ymm2[4,7]
1538 ; AVX-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload
1539 ; AVX-NEXT: # ymm2 = ymm12[0,1],mem[0,3],ymm12[4,5],mem[4,7]
1540 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
1541 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
1542 ; AVX-NEXT: vmovaps %ymm2, (%rsi)
1543 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
1544 ; AVX-NEXT: vmovaps %ymm2, 64(%rsi)
1545 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
1546 ; AVX-NEXT: vmovaps %ymm2, 96(%rsi)
1547 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
1548 ; AVX-NEXT: vmovaps %ymm2, 32(%rsi)
1549 ; AVX-NEXT: vmovaps %ymm4, 64(%rdx)
1550 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
1551 ; AVX-NEXT: vmovaps %ymm2, (%rdx)
1552 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
1553 ; AVX-NEXT: vmovaps %ymm2, 96(%rdx)
1554 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
1555 ; AVX-NEXT: vmovaps %ymm2, 32(%rdx)
1556 ; AVX-NEXT: vmovaps %ymm1, 64(%rcx)
1557 ; AVX-NEXT: vmovaps %ymm0, (%rcx)
1558 ; AVX-NEXT: vmovaps %ymm5, 96(%rcx)
1559 ; AVX-NEXT: vmovaps %ymm6, 32(%rcx)
1560 ; AVX-NEXT: addq $392, %rsp # imm = 0x188
1561 ; AVX-NEXT: vzeroupper
1564 ; AVX2-LABEL: load_i32_stride3_vf32:
1566 ; AVX2-NEXT: subq $40, %rsp
1567 ; AVX2-NEXT: vmovaps 256(%rdi), %ymm15
1568 ; AVX2-NEXT: vmovaps 224(%rdi), %ymm5
1569 ; AVX2-NEXT: vmovaps 192(%rdi), %ymm3
1570 ; AVX2-NEXT: vmovaps 352(%rdi), %ymm4
1571 ; AVX2-NEXT: vmovaps 320(%rdi), %ymm8
1572 ; AVX2-NEXT: vmovaps 288(%rdi), %ymm10
1573 ; AVX2-NEXT: vmovaps 160(%rdi), %ymm13
1574 ; AVX2-NEXT: vmovaps 128(%rdi), %ymm1
1575 ; AVX2-NEXT: vmovaps (%rdi), %ymm6
1576 ; AVX2-NEXT: vmovaps 32(%rdi), %ymm9
1577 ; AVX2-NEXT: vmovaps 64(%rdi), %ymm7
1578 ; AVX2-NEXT: vmovaps 96(%rdi), %ymm0
1579 ; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
1580 ; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm13[2],ymm11[3,4],ymm13[5],ymm11[6,7]
1581 ; AVX2-NEXT: vmovaps {{.*#+}} ymm12 = [0,3,6,1,4,7,2,5]
1582 ; AVX2-NEXT: vpermps %ymm11, %ymm12, %ymm2
1583 ; AVX2-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill
1584 ; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm10[0],ymm8[1],ymm10[2,3],ymm8[4],ymm10[5,6],ymm8[7]
1585 ; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm4[2],ymm11[3,4],ymm4[5],ymm11[6,7]
1586 ; AVX2-NEXT: vpermps %ymm11, %ymm12, %ymm2
1587 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1588 ; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm6[0],ymm9[1],ymm6[2,3],ymm9[4],ymm6[5,6],ymm9[7]
1589 ; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm7[2],ymm11[3,4],ymm7[5],ymm11[6,7]
1590 ; AVX2-NEXT: vpermps %ymm11, %ymm12, %ymm2
1591 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1592 ; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5,6],ymm5[7]
1593 ; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7]
1594 ; AVX2-NEXT: vpermps %ymm14, %ymm12, %ymm2
1595 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1596 ; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
1597 ; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0],ymm12[1,2],ymm13[3],ymm12[4,5],ymm13[6],ymm12[7]
1598 ; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = [1,4,7,2,5,0,3,6]
1599 ; AVX2-NEXT: vpermps %ymm12, %ymm2, %ymm11
1600 ; AVX2-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1601 ; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm10[0,1],ymm8[2],ymm10[3,4],ymm8[5],ymm10[6,7]
1602 ; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm4[0],ymm11[1,2],ymm4[3],ymm11[4,5],ymm4[6],ymm11[7]
1603 ; AVX2-NEXT: vpermps %ymm11, %ymm2, %ymm11
1604 ; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm6[0,1],ymm9[2],ymm6[3,4],ymm9[5],ymm6[6,7]
1605 ; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm7[0],ymm12[1,2],ymm7[3],ymm12[4,5],ymm7[6],ymm12[7]
1606 ; AVX2-NEXT: vpermps %ymm12, %ymm2, %ymm12
1607 ; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7]
1608 ; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0],ymm14[1,2],ymm15[3],ymm14[4,5],ymm15[6],ymm14[7]
1609 ; AVX2-NEXT: vpermps %ymm14, %ymm2, %ymm2
1610 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
1611 ; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [2,5,0,3,6,u,u,u]
1612 ; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
1613 ; AVX2-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,0,3,4,5,4,7]
1614 ; AVX2-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,1,0,3]
1615 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5,6,7]
1616 ; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm10[2],ymm8[3,4],ymm10[5],ymm8[6,7]
1617 ; AVX2-NEXT: vpermps %ymm8, %ymm1, %ymm8
1618 ; AVX2-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,0,3,4,5,4,7]
1619 ; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3]
1620 ; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1,2,3,4],ymm4[5,6,7]
1621 ; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1],ymm6[2],ymm9[3,4],ymm6[5],ymm9[6,7]
1622 ; AVX2-NEXT: vpermps %ymm6, %ymm1, %ymm6
1623 ; AVX2-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,1,0,3,4,5,4,7]
1624 ; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3]
1625 ; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7]
1626 ; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7]
1627 ; AVX2-NEXT: vpermps %ymm3, %ymm1, %ymm1
1628 ; AVX2-NEXT: vshufps {{.*#+}} ymm3 = ymm15[0,1,0,3,4,5,4,7]
1629 ; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3]
1630 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7]
1631 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
1632 ; AVX2-NEXT: vmovaps %ymm3, 64(%rsi)
1633 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
1634 ; AVX2-NEXT: vmovaps %ymm3, (%rsi)
1635 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
1636 ; AVX2-NEXT: vmovaps %ymm3, 96(%rsi)
1637 ; AVX2-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload
1638 ; AVX2-NEXT: vmovaps %ymm3, 32(%rsi)
1639 ; AVX2-NEXT: vmovaps %ymm2, 64(%rdx)
1640 ; AVX2-NEXT: vmovaps %ymm12, (%rdx)
1641 ; AVX2-NEXT: vmovaps %ymm11, 96(%rdx)
1642 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
1643 ; AVX2-NEXT: vmovaps %ymm2, 32(%rdx)
1644 ; AVX2-NEXT: vmovaps %ymm1, 64(%rcx)
1645 ; AVX2-NEXT: vmovaps %ymm6, (%rcx)
1646 ; AVX2-NEXT: vmovaps %ymm4, 96(%rcx)
1647 ; AVX2-NEXT: vmovaps %ymm0, 32(%rcx)
1648 ; AVX2-NEXT: addq $40, %rsp
1649 ; AVX2-NEXT: vzeroupper
1652 ; AVX2-FP-LABEL: load_i32_stride3_vf32:
1654 ; AVX2-FP-NEXT: subq $40, %rsp
1655 ; AVX2-FP-NEXT: vmovaps 256(%rdi), %ymm15
1656 ; AVX2-FP-NEXT: vmovaps 224(%rdi), %ymm5
1657 ; AVX2-FP-NEXT: vmovaps 192(%rdi), %ymm3
1658 ; AVX2-FP-NEXT: vmovaps 352(%rdi), %ymm4
1659 ; AVX2-FP-NEXT: vmovaps 320(%rdi), %ymm8
1660 ; AVX2-FP-NEXT: vmovaps 288(%rdi), %ymm10
1661 ; AVX2-FP-NEXT: vmovaps 160(%rdi), %ymm13
1662 ; AVX2-FP-NEXT: vmovaps 128(%rdi), %ymm1
1663 ; AVX2-FP-NEXT: vmovaps (%rdi), %ymm6
1664 ; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm9
1665 ; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm7
1666 ; AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm0
1667 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
1668 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm13[2],ymm11[3,4],ymm13[5],ymm11[6,7]
1669 ; AVX2-FP-NEXT: vmovaps {{.*#+}} ymm12 = [0,3,6,1,4,7,2,5]
1670 ; AVX2-FP-NEXT: vpermps %ymm11, %ymm12, %ymm2
1671 ; AVX2-FP-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill
1672 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm10[0],ymm8[1],ymm10[2,3],ymm8[4],ymm10[5,6],ymm8[7]
1673 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm4[2],ymm11[3,4],ymm4[5],ymm11[6,7]
1674 ; AVX2-FP-NEXT: vpermps %ymm11, %ymm12, %ymm2
1675 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1676 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm6[0],ymm9[1],ymm6[2,3],ymm9[4],ymm6[5,6],ymm9[7]
1677 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm7[2],ymm11[3,4],ymm7[5],ymm11[6,7]
1678 ; AVX2-FP-NEXT: vpermps %ymm11, %ymm12, %ymm2
1679 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1680 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5,6],ymm5[7]
1681 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7]
1682 ; AVX2-FP-NEXT: vpermps %ymm14, %ymm12, %ymm2
1683 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1684 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
1685 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0],ymm12[1,2],ymm13[3],ymm12[4,5],ymm13[6],ymm12[7]
1686 ; AVX2-FP-NEXT: vmovaps {{.*#+}} ymm2 = [1,4,7,2,5,0,3,6]
1687 ; AVX2-FP-NEXT: vpermps %ymm12, %ymm2, %ymm11
1688 ; AVX2-FP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1689 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm10[0,1],ymm8[2],ymm10[3,4],ymm8[5],ymm10[6,7]
1690 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm4[0],ymm11[1,2],ymm4[3],ymm11[4,5],ymm4[6],ymm11[7]
1691 ; AVX2-FP-NEXT: vpermps %ymm11, %ymm2, %ymm11
1692 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm6[0,1],ymm9[2],ymm6[3,4],ymm9[5],ymm6[6,7]
1693 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm7[0],ymm12[1,2],ymm7[3],ymm12[4,5],ymm7[6],ymm12[7]
1694 ; AVX2-FP-NEXT: vpermps %ymm12, %ymm2, %ymm12
1695 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7]
1696 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0],ymm14[1,2],ymm15[3],ymm14[4,5],ymm15[6],ymm14[7]
1697 ; AVX2-FP-NEXT: vpermps %ymm14, %ymm2, %ymm2
1698 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
1699 ; AVX2-FP-NEXT: vmovaps {{.*#+}} ymm1 = [2,5,0,3,6,u,u,u]
1700 ; AVX2-FP-NEXT: vpermps %ymm0, %ymm1, %ymm0
1701 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,0,3,4,5,4,7]
1702 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,1,0,3]
1703 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5,6,7]
1704 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm10[2],ymm8[3,4],ymm10[5],ymm8[6,7]
1705 ; AVX2-FP-NEXT: vpermps %ymm8, %ymm1, %ymm8
1706 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,0,3,4,5,4,7]
1707 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3]
1708 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1,2,3,4],ymm4[5,6,7]
1709 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1],ymm6[2],ymm9[3,4],ymm6[5],ymm9[6,7]
1710 ; AVX2-FP-NEXT: vpermps %ymm6, %ymm1, %ymm6
1711 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,1,0,3,4,5,4,7]
1712 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3]
1713 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7]
1714 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7]
1715 ; AVX2-FP-NEXT: vpermps %ymm3, %ymm1, %ymm1
1716 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm3 = ymm15[0,1,0,3,4,5,4,7]
1717 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3]
1718 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7]
1719 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
1720 ; AVX2-FP-NEXT: vmovaps %ymm3, 64(%rsi)
1721 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
1722 ; AVX2-FP-NEXT: vmovaps %ymm3, (%rsi)
1723 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
1724 ; AVX2-FP-NEXT: vmovaps %ymm3, 96(%rsi)
1725 ; AVX2-FP-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload
1726 ; AVX2-FP-NEXT: vmovaps %ymm3, 32(%rsi)
1727 ; AVX2-FP-NEXT: vmovaps %ymm2, 64(%rdx)
1728 ; AVX2-FP-NEXT: vmovaps %ymm12, (%rdx)
1729 ; AVX2-FP-NEXT: vmovaps %ymm11, 96(%rdx)
1730 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
1731 ; AVX2-FP-NEXT: vmovaps %ymm2, 32(%rdx)
1732 ; AVX2-FP-NEXT: vmovaps %ymm1, 64(%rcx)
1733 ; AVX2-FP-NEXT: vmovaps %ymm6, (%rcx)
1734 ; AVX2-FP-NEXT: vmovaps %ymm4, 96(%rcx)
1735 ; AVX2-FP-NEXT: vmovaps %ymm0, 32(%rcx)
1736 ; AVX2-FP-NEXT: addq $40, %rsp
1737 ; AVX2-FP-NEXT: vzeroupper
1738 ; AVX2-FP-NEXT: retq
1740 ; AVX2-FCP-LABEL: load_i32_stride3_vf32:
1741 ; AVX2-FCP: # %bb.0:
1742 ; AVX2-FCP-NEXT: subq $72, %rsp
1743 ; AVX2-FCP-NEXT: vmovaps 256(%rdi), %ymm0
1744 ; AVX2-FCP-NEXT: vmovaps 224(%rdi), %ymm2
1745 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1746 ; AVX2-FCP-NEXT: vmovaps 192(%rdi), %ymm1
1747 ; AVX2-FCP-NEXT: vmovaps 352(%rdi), %ymm4
1748 ; AVX2-FCP-NEXT: vmovaps 320(%rdi), %ymm6
1749 ; AVX2-FCP-NEXT: vmovaps 288(%rdi), %ymm7
1750 ; AVX2-FCP-NEXT: vmovaps 160(%rdi), %ymm13
1751 ; AVX2-FCP-NEXT: vmovaps 128(%rdi), %ymm14
1752 ; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm8
1753 ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm9
1754 ; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm10
1755 ; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm15
1756 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5,6],ymm14[7]
1757 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm13[2],ymm3[3,4],ymm13[5],ymm3[6,7]
1758 ; AVX2-FCP-NEXT: vmovaps {{.*#+}} ymm12 = [0,3,6,1,4,7,2,5]
1759 ; AVX2-FCP-NEXT: vpermps %ymm3, %ymm12, %ymm3
1760 ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1761 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6],ymm6[7]
1762 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7]
1763 ; AVX2-FCP-NEXT: vpermps %ymm5, %ymm12, %ymm3
1764 ; AVX2-FCP-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill
1765 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5,6],ymm9[7]
1766 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm10[2],ymm11[3,4],ymm10[5],ymm11[6,7]
1767 ; AVX2-FCP-NEXT: vpermps %ymm11, %ymm12, %ymm3
1768 ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1769 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7]
1770 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7]
1771 ; AVX2-FCP-NEXT: vmovaps %ymm0, %ymm2
1772 ; AVX2-FCP-NEXT: vpermps %ymm3, %ymm12, %ymm0
1773 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1774 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm15[0,1],ymm14[2],ymm15[3,4],ymm14[5],ymm15[6,7]
1775 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0],ymm12[1,2],ymm13[3],ymm12[4,5],ymm13[6],ymm12[7]
1776 ; AVX2-FCP-NEXT: vmovaps {{.*#+}} ymm3 = [1,4,7,2,5,0,3,6]
1777 ; AVX2-FCP-NEXT: vpermps %ymm12, %ymm3, %ymm5
1778 ; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1779 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7]
1780 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7]
1781 ; AVX2-FCP-NEXT: vpermps %ymm5, %ymm3, %ymm5
1782 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7]
1783 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm10[0],ymm11[1,2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7]
1784 ; AVX2-FCP-NEXT: vpermps %ymm11, %ymm3, %ymm11
1785 ; AVX2-FCP-NEXT: vmovaps %ymm1, %ymm0
1786 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
1787 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
1788 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm2[0],ymm12[1,2],ymm2[3],ymm12[4,5],ymm2[6],ymm12[7]
1789 ; AVX2-FCP-NEXT: vpermps %ymm12, %ymm3, %ymm3
1790 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7]
1791 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm13[1],ymm12[2,3],ymm13[4],ymm12[5,6],ymm13[7]
1792 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7]
1793 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5,6],ymm4[7]
1794 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1],ymm8[2],ymm9[3,4],ymm8[5],ymm9[6,7]
1795 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm10[1],ymm6[2,3],ymm10[4],ymm6[5,6],ymm10[7]
1796 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
1797 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7]
1798 ; AVX2-FCP-NEXT: vmovaps {{.*#+}} ymm2 = [2,5,0,3,6,1,4,7]
1799 ; AVX2-FCP-NEXT: vpermps %ymm12, %ymm2, %ymm7
1800 ; AVX2-FCP-NEXT: vpermps %ymm4, %ymm2, %ymm4
1801 ; AVX2-FCP-NEXT: vpermps %ymm6, %ymm2, %ymm6
1802 ; AVX2-FCP-NEXT: vpermps %ymm1, %ymm2, %ymm1
1803 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1804 ; AVX2-FCP-NEXT: vmovaps %ymm0, 64(%rsi)
1805 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
1806 ; AVX2-FCP-NEXT: vmovaps %ymm2, (%rsi)
1807 ; AVX2-FCP-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload
1808 ; AVX2-FCP-NEXT: vmovaps %ymm2, 96(%rsi)
1809 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
1810 ; AVX2-FCP-NEXT: vmovaps %ymm2, 32(%rsi)
1811 ; AVX2-FCP-NEXT: vmovaps %ymm3, 64(%rdx)
1812 ; AVX2-FCP-NEXT: vmovaps %ymm11, (%rdx)
1813 ; AVX2-FCP-NEXT: vmovaps %ymm5, 96(%rdx)
1814 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1815 ; AVX2-FCP-NEXT: vmovaps %ymm0, 32(%rdx)
1816 ; AVX2-FCP-NEXT: vmovaps %ymm1, 64(%rcx)
1817 ; AVX2-FCP-NEXT: vmovaps %ymm6, (%rcx)
1818 ; AVX2-FCP-NEXT: vmovaps %ymm4, 96(%rcx)
1819 ; AVX2-FCP-NEXT: vmovaps %ymm7, 32(%rcx)
1820 ; AVX2-FCP-NEXT: addq $72, %rsp
1821 ; AVX2-FCP-NEXT: vzeroupper
1822 ; AVX2-FCP-NEXT: retq
1824 ; AVX512-LABEL: load_i32_stride3_vf32:
1826 ; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm0
1827 ; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm1
1828 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm2
1829 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm3
1830 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm4
1831 ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm5
1832 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0]
1833 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm7
1834 ; AVX512-NEXT: vpermt2d %zmm1, %zmm6, %zmm7
1835 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29]
1836 ; AVX512-NEXT: vpermt2d %zmm0, %zmm8, %zmm7
1837 ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm6
1838 ; AVX512-NEXT: vpermt2d %zmm4, %zmm8, %zmm6
1839 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm8 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0]
1840 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm9
1841 ; AVX512-NEXT: vpermt2d %zmm5, %zmm8, %zmm9
1842 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30]
1843 ; AVX512-NEXT: vpermt2d %zmm0, %zmm10, %zmm9
1844 ; AVX512-NEXT: vpermi2d %zmm2, %zmm3, %zmm8
1845 ; AVX512-NEXT: vpermt2d %zmm4, %zmm10, %zmm8
1846 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm10 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0]
1847 ; AVX512-NEXT: vpermt2d %zmm1, %zmm10, %zmm5
1848 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31]
1849 ; AVX512-NEXT: vpermt2d %zmm0, %zmm1, %zmm5
1850 ; AVX512-NEXT: vpermt2d %zmm3, %zmm10, %zmm2
1851 ; AVX512-NEXT: vpermt2d %zmm4, %zmm1, %zmm2
1852 ; AVX512-NEXT: vmovdqa64 %zmm7, 64(%rsi)
1853 ; AVX512-NEXT: vmovdqa64 %zmm6, (%rsi)
1854 ; AVX512-NEXT: vmovdqa64 %zmm9, 64(%rdx)
1855 ; AVX512-NEXT: vmovdqa64 %zmm8, (%rdx)
1856 ; AVX512-NEXT: vmovdqa64 %zmm5, 64(%rcx)
1857 ; AVX512-NEXT: vmovdqa64 %zmm2, (%rcx)
1858 ; AVX512-NEXT: vzeroupper
1861 ; AVX512-FCP-LABEL: load_i32_stride3_vf32:
1862 ; AVX512-FCP: # %bb.0:
1863 ; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm0
1864 ; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm1
1865 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm2
1866 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3
1867 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm4
1868 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm5
1869 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0]
1870 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm7
1871 ; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm6, %zmm7
1872 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29]
1873 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm8, %zmm7
1874 ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm6
1875 ; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm8, %zmm6
1876 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0]
1877 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm9
1878 ; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm8, %zmm9
1879 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30]
1880 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm10, %zmm9
1881 ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm8
1882 ; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm10, %zmm8
1883 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0]
1884 ; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm10, %zmm5
1885 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31]
1886 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm5
1887 ; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm10, %zmm2
1888 ; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm1, %zmm2
1889 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 64(%rsi)
1890 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, (%rsi)
1891 ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, 64(%rdx)
1892 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, (%rdx)
1893 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 64(%rcx)
1894 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, (%rcx)
1895 ; AVX512-FCP-NEXT: vzeroupper
1896 ; AVX512-FCP-NEXT: retq
1898 ; AVX512DQ-LABEL: load_i32_stride3_vf32:
1899 ; AVX512DQ: # %bb.0:
1900 ; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm0
1901 ; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm1
1902 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm2
1903 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm3
1904 ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm4
1905 ; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm5
1906 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0]
1907 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm7
1908 ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm6, %zmm7
1909 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29]
1910 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm8, %zmm7
1911 ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm6
1912 ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm8, %zmm6
1913 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0]
1914 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm9
1915 ; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm8, %zmm9
1916 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30]
1917 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm10, %zmm9
1918 ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm3, %zmm8
1919 ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm10, %zmm8
1920 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm10 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0]
1921 ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm10, %zmm5
1922 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31]
1923 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm1, %zmm5
1924 ; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm10, %zmm2
1925 ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm1, %zmm2
1926 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, 64(%rsi)
1927 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, (%rsi)
1928 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, 64(%rdx)
1929 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, (%rdx)
1930 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, 64(%rcx)
1931 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, (%rcx)
1932 ; AVX512DQ-NEXT: vzeroupper
1933 ; AVX512DQ-NEXT: retq
1935 ; AVX512DQ-FCP-LABEL: load_i32_stride3_vf32:
1936 ; AVX512DQ-FCP: # %bb.0:
1937 ; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm0
1938 ; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm1
1939 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm2
1940 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3
1941 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm4
1942 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm5
1943 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0]
1944 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm7
1945 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm6, %zmm7
1946 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29]
1947 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm8, %zmm7
1948 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm6
1949 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm8, %zmm6
1950 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0]
1951 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm9
1952 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm8, %zmm9
1953 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30]
1954 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm10, %zmm9
1955 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm8
1956 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm10, %zmm8
1957 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0]
1958 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm10, %zmm5
1959 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31]
1960 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm5
1961 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm10, %zmm2
1962 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm1, %zmm2
1963 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 64(%rsi)
1964 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, (%rsi)
1965 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, 64(%rdx)
1966 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, (%rdx)
1967 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 64(%rcx)
1968 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%rcx)
1969 ; AVX512DQ-FCP-NEXT: vzeroupper
1970 ; AVX512DQ-FCP-NEXT: retq
1972 ; AVX512BW-LABEL: load_i32_stride3_vf32:
1973 ; AVX512BW: # %bb.0:
1974 ; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm0
1975 ; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm1
1976 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2
1977 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm3
1978 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm4
1979 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm5
1980 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0]
1981 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm7
1982 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm6, %zmm7
1983 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29]
1984 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm7
1985 ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm6
1986 ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm8, %zmm6
1987 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0]
1988 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9
1989 ; AVX512BW-NEXT: vpermt2d %zmm5, %zmm8, %zmm9
1990 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30]
1991 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm10, %zmm9
1992 ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm8
1993 ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm10, %zmm8
1994 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0]
1995 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm10, %zmm5
1996 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31]
1997 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm5
1998 ; AVX512BW-NEXT: vpermt2d %zmm3, %zmm10, %zmm2
1999 ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm1, %zmm2
2000 ; AVX512BW-NEXT: vmovdqa64 %zmm7, 64(%rsi)
2001 ; AVX512BW-NEXT: vmovdqa64 %zmm6, (%rsi)
2002 ; AVX512BW-NEXT: vmovdqa64 %zmm9, 64(%rdx)
2003 ; AVX512BW-NEXT: vmovdqa64 %zmm8, (%rdx)
2004 ; AVX512BW-NEXT: vmovdqa64 %zmm5, 64(%rcx)
2005 ; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rcx)
2006 ; AVX512BW-NEXT: vzeroupper
2007 ; AVX512BW-NEXT: retq
2009 ; AVX512BW-FCP-LABEL: load_i32_stride3_vf32:
2010 ; AVX512BW-FCP: # %bb.0:
2011 ; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm0
2012 ; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm1
2013 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2
2014 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3
2015 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm4
2016 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm5
2017 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0]
2018 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7
2019 ; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm6, %zmm7
2020 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29]
2021 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm8, %zmm7
2022 ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm6
2023 ; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm8, %zmm6
2024 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0]
2025 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm9
2026 ; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm8, %zmm9
2027 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30]
2028 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm10, %zmm9
2029 ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm8
2030 ; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm10, %zmm8
2031 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0]
2032 ; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm10, %zmm5
2033 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31]
2034 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm5
2035 ; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm10, %zmm2
2036 ; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm1, %zmm2
2037 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, 64(%rsi)
2038 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, (%rsi)
2039 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, 64(%rdx)
2040 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, (%rdx)
2041 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 64(%rcx)
2042 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%rcx)
2043 ; AVX512BW-FCP-NEXT: vzeroupper
2044 ; AVX512BW-FCP-NEXT: retq
2046 ; AVX512DQ-BW-LABEL: load_i32_stride3_vf32:
2047 ; AVX512DQ-BW: # %bb.0:
2048 ; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm0
2049 ; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm1
2050 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm2
2051 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm3
2052 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm4
2053 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm5
2054 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0]
2055 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm7
2056 ; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm6, %zmm7
2057 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29]
2058 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm7
2059 ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm6
2060 ; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm8, %zmm6
2061 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0]
2062 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm9
2063 ; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm8, %zmm9
2064 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30]
2065 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm10, %zmm9
2066 ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm8
2067 ; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm10, %zmm8
2068 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0]
2069 ; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm10, %zmm5
2070 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31]
2071 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm5
2072 ; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm10, %zmm2
2073 ; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm1, %zmm2
2074 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, 64(%rsi)
2075 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, (%rsi)
2076 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, 64(%rdx)
2077 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, (%rdx)
2078 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 64(%rcx)
2079 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%rcx)
2080 ; AVX512DQ-BW-NEXT: vzeroupper
2081 ; AVX512DQ-BW-NEXT: retq
2083 ; AVX512DQ-BW-FCP-LABEL: load_i32_stride3_vf32:
2084 ; AVX512DQ-BW-FCP: # %bb.0:
2085 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm0
2086 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm1
2087 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2
2088 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3
2089 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm4
2090 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm5
2091 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0]
2092 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7
2093 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm6, %zmm7
2094 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29]
2095 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm8, %zmm7
2096 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm6
2097 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm8, %zmm6
2098 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0]
2099 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm9
2100 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm8, %zmm9
2101 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30]
2102 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm10, %zmm9
2103 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm8
2104 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm10, %zmm8
2105 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0]
2106 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm10, %zmm5
2107 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31]
2108 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm5
2109 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm10, %zmm2
2110 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm1, %zmm2
2111 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, 64(%rsi)
2112 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, (%rsi)
2113 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, 64(%rdx)
2114 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, (%rdx)
2115 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 64(%rcx)
2116 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%rcx)
2117 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
2118 ; AVX512DQ-BW-FCP-NEXT: retq
2119 %wide.vec = load <96 x i32>, ptr %in.vec, align 64
2120 %strided.vec0 = shufflevector <96 x i32> %wide.vec, <96 x i32> poison, <32 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45, i32 48, i32 51, i32 54, i32 57, i32 60, i32 63, i32 66, i32 69, i32 72, i32 75, i32 78, i32 81, i32 84, i32 87, i32 90, i32 93>
2121 %strided.vec1 = shufflevector <96 x i32> %wide.vec, <96 x i32> poison, <32 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46, i32 49, i32 52, i32 55, i32 58, i32 61, i32 64, i32 67, i32 70, i32 73, i32 76, i32 79, i32 82, i32 85, i32 88, i32 91, i32 94>
2122 %strided.vec2 = shufflevector <96 x i32> %wide.vec, <96 x i32> poison, <32 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47, i32 50, i32 53, i32 56, i32 59, i32 62, i32 65, i32 68, i32 71, i32 74, i32 77, i32 80, i32 83, i32 86, i32 89, i32 92, i32 95>
2123 store <32 x i32> %strided.vec0, ptr %out.vec0, align 64
2124 store <32 x i32> %strided.vec1, ptr %out.vec1, align 64
2125 store <32 x i32> %strided.vec2, ptr %out.vec2, align 64
2129 define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) nounwind {
2130 ; SSE-LABEL: load_i32_stride3_vf64:
2132 ; SSE-NEXT: subq $1112, %rsp # imm = 0x458
2133 ; SSE-NEXT: movaps 624(%rdi), %xmm2
2134 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2135 ; SSE-NEXT: movaps 656(%rdi), %xmm4
2136 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2137 ; SSE-NEXT: movaps 640(%rdi), %xmm10
2138 ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2139 ; SSE-NEXT: movaps 432(%rdi), %xmm6
2140 ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2141 ; SSE-NEXT: movaps 464(%rdi), %xmm5
2142 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2143 ; SSE-NEXT: movaps 448(%rdi), %xmm11
2144 ; SSE-NEXT: movaps %xmm11, (%rsp) # 16-byte Spill
2145 ; SSE-NEXT: movaps 240(%rdi), %xmm7
2146 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2147 ; SSE-NEXT: movaps 272(%rdi), %xmm3
2148 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2149 ; SSE-NEXT: movaps 256(%rdi), %xmm13
2150 ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2151 ; SSE-NEXT: movaps 48(%rdi), %xmm9
2152 ; SSE-NEXT: movaps 80(%rdi), %xmm1
2153 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2154 ; SSE-NEXT: movaps 64(%rdi), %xmm12
2155 ; SSE-NEXT: movaps %xmm12, %xmm0
2156 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0]
2157 ; SSE-NEXT: movaps %xmm9, %xmm1
2158 ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2159 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2]
2160 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2161 ; SSE-NEXT: movaps %xmm13, %xmm0
2162 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[1,0]
2163 ; SSE-NEXT: movaps %xmm7, %xmm1
2164 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2]
2165 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2166 ; SSE-NEXT: movaps %xmm11, %xmm0
2167 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm5[1,0]
2168 ; SSE-NEXT: movaps %xmm6, %xmm1
2169 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2]
2170 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2171 ; SSE-NEXT: movaps %xmm10, %xmm0
2172 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm4[1,0]
2173 ; SSE-NEXT: movaps %xmm2, %xmm1
2174 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2]
2175 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2176 ; SSE-NEXT: movaps 16(%rdi), %xmm0
2177 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2178 ; SSE-NEXT: movaps 32(%rdi), %xmm1
2179 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2180 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0]
2181 ; SSE-NEXT: movaps (%rdi), %xmm1
2182 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2183 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2]
2184 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2185 ; SSE-NEXT: movaps 224(%rdi), %xmm1
2186 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2187 ; SSE-NEXT: movaps 208(%rdi), %xmm0
2188 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2189 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0]
2190 ; SSE-NEXT: movaps 192(%rdi), %xmm1
2191 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2192 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2]
2193 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2194 ; SSE-NEXT: movaps 416(%rdi), %xmm1
2195 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2196 ; SSE-NEXT: movaps 400(%rdi), %xmm0
2197 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2198 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0]
2199 ; SSE-NEXT: movaps 384(%rdi), %xmm1
2200 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2201 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2]
2202 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2203 ; SSE-NEXT: movaps 608(%rdi), %xmm1
2204 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2205 ; SSE-NEXT: movaps 592(%rdi), %xmm0
2206 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2207 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0]
2208 ; SSE-NEXT: movaps 576(%rdi), %xmm1
2209 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2210 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2]
2211 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2212 ; SSE-NEXT: movaps 176(%rdi), %xmm10
2213 ; SSE-NEXT: movaps 160(%rdi), %xmm8
2214 ; SSE-NEXT: movaps %xmm8, %xmm0
2215 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm10[1,0]
2216 ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2217 ; SSE-NEXT: movaps 144(%rdi), %xmm2
2218 ; SSE-NEXT: movaps %xmm2, %xmm1
2219 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2220 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2]
2221 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2222 ; SSE-NEXT: movaps 368(%rdi), %xmm1
2223 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2224 ; SSE-NEXT: movaps 352(%rdi), %xmm15
2225 ; SSE-NEXT: movaps %xmm15, %xmm0
2226 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0]
2227 ; SSE-NEXT: movaps 336(%rdi), %xmm14
2228 ; SSE-NEXT: movaps %xmm14, %xmm1
2229 ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2230 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2]
2231 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2232 ; SSE-NEXT: movaps 560(%rdi), %xmm1
2233 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2234 ; SSE-NEXT: movaps 544(%rdi), %xmm0
2235 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2236 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0]
2237 ; SSE-NEXT: movaps 528(%rdi), %xmm1
2238 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2239 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2]
2240 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2241 ; SSE-NEXT: movaps 752(%rdi), %xmm1
2242 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2243 ; SSE-NEXT: movaps 736(%rdi), %xmm0
2244 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2245 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0]
2246 ; SSE-NEXT: movaps 720(%rdi), %xmm1
2247 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2248 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2]
2249 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2250 ; SSE-NEXT: movaps 128(%rdi), %xmm6
2251 ; SSE-NEXT: movaps 112(%rdi), %xmm4
2252 ; SSE-NEXT: movaps %xmm4, %xmm1
2253 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm6[1,0]
2254 ; SSE-NEXT: movaps 96(%rdi), %xmm3
2255 ; SSE-NEXT: movaps %xmm3, %xmm7
2256 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2257 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,3],xmm1[0,2]
2258 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2259 ; SSE-NEXT: movaps 320(%rdi), %xmm13
2260 ; SSE-NEXT: movaps 304(%rdi), %xmm11
2261 ; SSE-NEXT: movaps %xmm11, %xmm1
2262 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm13[1,0]
2263 ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2264 ; SSE-NEXT: movaps 288(%rdi), %xmm5
2265 ; SSE-NEXT: movaps %xmm5, %xmm7
2266 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2267 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,3],xmm1[0,2]
2268 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2269 ; SSE-NEXT: movaps 512(%rdi), %xmm0
2270 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2271 ; SSE-NEXT: movaps 496(%rdi), %xmm1
2272 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2273 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,0]
2274 ; SSE-NEXT: movaps 480(%rdi), %xmm7
2275 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2276 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,3],xmm1[0,2]
2277 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2278 ; SSE-NEXT: movaps 704(%rdi), %xmm7
2279 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2280 ; SSE-NEXT: movaps 688(%rdi), %xmm1
2281 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2282 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm7[1,0]
2283 ; SSE-NEXT: movaps 672(%rdi), %xmm7
2284 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2285 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,3],xmm1[0,2]
2286 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2287 ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2288 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,0],xmm12[0,0]
2289 ; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
2290 ; SSE-NEXT: # xmm12 = xmm12[3,1],mem[2,3]
2291 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm12[0,2]
2292 ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2293 ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2294 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm8[0,0]
2295 ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,1],xmm10[2,3]
2296 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm8[0,2]
2297 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2298 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2299 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm4[0,0]
2300 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm6[2,3]
2301 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
2302 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2303 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
2304 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2305 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm1[0,0]
2306 ; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
2307 ; SSE-NEXT: # xmm1 = xmm1[3,1],mem[2,3]
2308 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm1[0,2]
2309 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2310 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
2311 ; SSE-NEXT: movaps %xmm9, %xmm0
2312 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
2313 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm3[0,0]
2314 ; SSE-NEXT: movaps %xmm3, %xmm1
2315 ; SSE-NEXT: movaps %xmm3, %xmm12
2316 ; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
2317 ; SSE-NEXT: # xmm1 = xmm1[3,1],mem[2,3]
2318 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2319 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2320 ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2321 ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,0],xmm15[0,0]
2322 ; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
2323 ; SSE-NEXT: # xmm15 = xmm15[3,1],mem[2,3]
2324 ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2],xmm15[0,2]
2325 ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2326 ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2327 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm11[0,0]
2328 ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[3,1],xmm13[2,3]
2329 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm11[0,2]
2330 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2331 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2332 ; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload
2333 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
2334 ; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
2335 ; SSE-NEXT: # xmm1 = xmm1[3,1],mem[2,3]
2336 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2337 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2338 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
2339 ; SSE-NEXT: movaps %xmm7, %xmm0
2340 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2341 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm4[0,0]
2342 ; SSE-NEXT: movaps %xmm4, %xmm1
2343 ; SSE-NEXT: movaps %xmm4, %xmm8
2344 ; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
2345 ; SSE-NEXT: # xmm1 = xmm1[3,1],mem[2,3]
2346 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2347 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2348 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2349 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2350 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
2351 ; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
2352 ; SSE-NEXT: # xmm1 = xmm1[3,1],mem[2,3]
2353 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2354 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2355 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2356 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2357 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
2358 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
2359 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm5[2,3]
2360 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2361 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2362 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2363 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2364 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
2365 ; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
2366 ; SSE-NEXT: # xmm1 = xmm1[3,1],mem[2,3]
2367 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2368 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2369 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2370 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
2371 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm14[0,0]
2372 ; SSE-NEXT: movaps %xmm14, %xmm1
2373 ; SSE-NEXT: movaps %xmm14, %xmm3
2374 ; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
2375 ; SSE-NEXT: # xmm1 = xmm1[3,1],mem[2,3]
2376 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2377 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2378 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2379 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2380 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
2381 ; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
2382 ; SSE-NEXT: # xmm1 = xmm1[3,1],mem[2,3]
2383 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2384 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2385 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
2386 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2387 ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,0],xmm1[0,0]
2388 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2389 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm4[2,3]
2390 ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2],xmm1[0,2]
2391 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2392 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm2[2,3,2,3]
2393 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2394 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[0,0]
2395 ; SSE-NEXT: movaps %xmm2, %xmm11
2396 ; SSE-NEXT: movaps %xmm0, %xmm1
2397 ; SSE-NEXT: movaps %xmm0, %xmm2
2398 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2399 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3]
2400 ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm1[0,2]
2401 ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2402 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
2403 ; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1]
2404 ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm0[0,3]
2405 ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2406 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
2407 ; SSE-NEXT: # xmm1 = mem[1,1,1,1]
2408 ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2409 ; SSE-NEXT: # xmm0 = mem[2,3,2,3]
2410 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2411 ; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2412 ; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,3]
2413 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2414 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2415 ; SSE-NEXT: # xmm0 = mem[1,1,1,1]
2416 ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
2417 ; SSE-NEXT: # xmm15 = mem[2,3,2,3]
2418 ; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1]
2419 ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm6[0,3]
2420 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2421 ; SSE-NEXT: # xmm0 = mem[1,1,1,1]
2422 ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
2423 ; SSE-NEXT: # xmm13 = mem[2,3,2,3]
2424 ; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1]
2425 ; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
2426 ; SSE-NEXT: # xmm13 = xmm13[0,1],mem[0,3]
2427 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[1,1,1,1]
2428 ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm9[2,3,2,3]
2429 ; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1]
2430 ; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
2431 ; SSE-NEXT: # xmm12 = xmm12[0,1],mem[0,3]
2432 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2433 ; SSE-NEXT: # xmm0 = mem[1,1,1,1]
2434 ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
2435 ; SSE-NEXT: # xmm11 = mem[2,3,2,3]
2436 ; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1]
2437 ; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
2438 ; SSE-NEXT: # xmm11 = xmm11[0,1],mem[0,3]
2439 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2440 ; SSE-NEXT: # xmm0 = mem[1,1,1,1]
2441 ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
2442 ; SSE-NEXT: # xmm10 = mem[2,3,2,3]
2443 ; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1]
2444 ; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
2445 ; SSE-NEXT: # xmm10 = xmm10[0,1],mem[0,3]
2446 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2447 ; SSE-NEXT: # xmm0 = mem[1,1,1,1]
2448 ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
2449 ; SSE-NEXT: # xmm9 = mem[2,3,2,3]
2450 ; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1]
2451 ; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
2452 ; SSE-NEXT: # xmm9 = xmm9[0,1],mem[0,3]
2453 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1]
2454 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm7[2,3,2,3]
2455 ; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1]
2456 ; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
2457 ; SSE-NEXT: # xmm8 = xmm8[0,1],mem[0,3]
2458 ; SSE-NEXT: pshufd $85, (%rsp), %xmm0 # 16-byte Folded Reload
2459 ; SSE-NEXT: # xmm0 = mem[1,1,1,1]
2460 ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
2461 ; SSE-NEXT: # xmm7 = mem[2,3,2,3]
2462 ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1]
2463 ; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
2464 ; SSE-NEXT: # xmm7 = xmm7[0,1],mem[0,3]
2465 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2466 ; SSE-NEXT: # xmm0 = mem[1,1,1,1]
2467 ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
2468 ; SSE-NEXT: # xmm6 = mem[2,3,2,3]
2469 ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1]
2470 ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm5[0,3]
2471 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2472 ; SSE-NEXT: # xmm0 = mem[1,1,1,1]
2473 ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
2474 ; SSE-NEXT: # xmm5 = mem[2,3,2,3]
2475 ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
2476 ; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
2477 ; SSE-NEXT: # xmm5 = xmm5[0,1],mem[0,3]
2478 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1]
2479 ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
2480 ; SSE-NEXT: # xmm3 = mem[2,3,2,3]
2481 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
2482 ; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
2483 ; SSE-NEXT: # xmm3 = xmm3[0,1],mem[0,3]
2484 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2485 ; SSE-NEXT: # xmm0 = mem[1,1,1,1]
2486 ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
2487 ; SSE-NEXT: # xmm2 = mem[2,3,2,3]
2488 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
2489 ; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
2490 ; SSE-NEXT: # xmm2 = xmm2[0,1],mem[0,3]
2491 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2492 ; SSE-NEXT: # xmm0 = mem[1,1,1,1]
2493 ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
2494 ; SSE-NEXT: # xmm1 = mem[2,3,2,3]
2495 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
2496 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0,3]
2497 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
2498 ; SSE-NEXT: # xmm4 = mem[1,1,1,1]
2499 ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2500 ; SSE-NEXT: # xmm0 = mem[2,3,2,3]
2501 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
2502 ; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2503 ; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,3]
2504 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2505 ; SSE-NEXT: movaps %xmm4, 224(%rsi)
2506 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2507 ; SSE-NEXT: movaps %xmm4, 160(%rsi)
2508 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2509 ; SSE-NEXT: movaps %xmm4, 96(%rsi)
2510 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2511 ; SSE-NEXT: movaps %xmm4, 32(%rsi)
2512 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2513 ; SSE-NEXT: movaps %xmm4, 240(%rsi)
2514 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2515 ; SSE-NEXT: movaps %xmm4, 176(%rsi)
2516 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2517 ; SSE-NEXT: movaps %xmm4, 112(%rsi)
2518 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2519 ; SSE-NEXT: movaps %xmm4, 48(%rsi)
2520 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2521 ; SSE-NEXT: movaps %xmm4, 192(%rsi)
2522 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2523 ; SSE-NEXT: movaps %xmm4, 128(%rsi)
2524 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2525 ; SSE-NEXT: movaps %xmm4, 64(%rsi)
2526 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2527 ; SSE-NEXT: movaps %xmm4, (%rsi)
2528 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2529 ; SSE-NEXT: movaps %xmm4, 208(%rsi)
2530 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2531 ; SSE-NEXT: movaps %xmm4, 144(%rsi)
2532 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2533 ; SSE-NEXT: movaps %xmm4, 80(%rsi)
2534 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2535 ; SSE-NEXT: movaps %xmm4, 16(%rsi)
2536 ; SSE-NEXT: movaps %xmm14, 224(%rdx)
2537 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2538 ; SSE-NEXT: movaps %xmm4, 240(%rdx)
2539 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2540 ; SSE-NEXT: movaps %xmm4, 192(%rdx)
2541 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2542 ; SSE-NEXT: movaps %xmm4, 208(%rdx)
2543 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2544 ; SSE-NEXT: movaps %xmm4, 160(%rdx)
2545 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2546 ; SSE-NEXT: movaps %xmm4, 176(%rdx)
2547 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2548 ; SSE-NEXT: movaps %xmm4, 128(%rdx)
2549 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2550 ; SSE-NEXT: movaps %xmm4, 144(%rdx)
2551 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2552 ; SSE-NEXT: movaps %xmm4, 96(%rdx)
2553 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2554 ; SSE-NEXT: movaps %xmm4, 112(%rdx)
2555 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2556 ; SSE-NEXT: movaps %xmm4, 64(%rdx)
2557 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2558 ; SSE-NEXT: movaps %xmm4, 80(%rdx)
2559 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2560 ; SSE-NEXT: movaps %xmm4, 32(%rdx)
2561 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2562 ; SSE-NEXT: movaps %xmm4, 48(%rdx)
2563 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2564 ; SSE-NEXT: movaps %xmm4, (%rdx)
2565 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2566 ; SSE-NEXT: movaps %xmm4, 16(%rdx)
2567 ; SSE-NEXT: movaps %xmm0, 240(%rcx)
2568 ; SSE-NEXT: movaps %xmm1, 224(%rcx)
2569 ; SSE-NEXT: movaps %xmm2, 208(%rcx)
2570 ; SSE-NEXT: movaps %xmm3, 192(%rcx)
2571 ; SSE-NEXT: movaps %xmm5, 176(%rcx)
2572 ; SSE-NEXT: movaps %xmm6, 160(%rcx)
2573 ; SSE-NEXT: movaps %xmm7, 144(%rcx)
2574 ; SSE-NEXT: movaps %xmm8, 128(%rcx)
2575 ; SSE-NEXT: movaps %xmm9, 112(%rcx)
2576 ; SSE-NEXT: movaps %xmm10, 96(%rcx)
2577 ; SSE-NEXT: movaps %xmm11, 80(%rcx)
2578 ; SSE-NEXT: movaps %xmm12, 64(%rcx)
2579 ; SSE-NEXT: movaps %xmm13, 48(%rcx)
2580 ; SSE-NEXT: movaps %xmm15, 32(%rcx)
2581 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2582 ; SSE-NEXT: movaps %xmm0, 16(%rcx)
2583 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2584 ; SSE-NEXT: movaps %xmm0, (%rcx)
2585 ; SSE-NEXT: addq $1112, %rsp # imm = 0x458
2588 ; AVX-LABEL: load_i32_stride3_vf64:
2590 ; AVX-NEXT: subq $1384, %rsp # imm = 0x568
2591 ; AVX-NEXT: vmovaps 544(%rdi), %ymm2
2592 ; AVX-NEXT: vmovaps 512(%rdi), %ymm3
2593 ; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2594 ; AVX-NEXT: vmovaps 480(%rdi), %ymm4
2595 ; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2596 ; AVX-NEXT: vmovaps 352(%rdi), %ymm5
2597 ; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2598 ; AVX-NEXT: vmovaps 320(%rdi), %ymm6
2599 ; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2600 ; AVX-NEXT: vmovaps 288(%rdi), %ymm7
2601 ; AVX-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2602 ; AVX-NEXT: vmovaps 160(%rdi), %ymm8
2603 ; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2604 ; AVX-NEXT: vmovaps 128(%rdi), %ymm9
2605 ; AVX-NEXT: vmovups %ymm9, (%rsp) # 32-byte Spill
2606 ; AVX-NEXT: vmovaps 96(%rdi), %ymm0
2607 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2608 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm9[1],ymm0[2,3],ymm9[4],ymm0[5,6],ymm9[7]
2609 ; AVX-NEXT: vmovaps 112(%rdi), %xmm1
2610 ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1],ymm9[1,3],ymm1[6,5],ymm9[5,7]
2611 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm1[0,2],ymm0[4,7],ymm1[4,6]
2612 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm8[2,3,0,1]
2613 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2614 ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,0],ymm1[2,0],ymm8[5,4],ymm1[6,4]
2615 ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
2616 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
2617 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2618 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6],ymm6[7]
2619 ; AVX-NEXT: vmovaps 304(%rdi), %xmm1
2620 ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1],ymm6[1,3],ymm1[6,5],ymm6[5,7]
2621 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm1[0,2],ymm0[4,7],ymm1[4,6]
2622 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm5[2,3,0,1]
2623 ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,0],ymm14[2,0],ymm5[5,4],ymm14[6,4]
2624 ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
2625 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
2626 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2627 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7]
2628 ; AVX-NEXT: vmovaps 496(%rdi), %xmm1
2629 ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1],ymm3[1,3],ymm1[6,5],ymm3[5,7]
2630 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm1[0,2],ymm0[4,7],ymm1[4,6]
2631 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2632 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm2[2,3,0,1]
2633 ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,0],ymm3[2,0],ymm2[5,4],ymm3[6,4]
2634 ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
2635 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
2636 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2637 ; AVX-NEXT: vmovaps 704(%rdi), %ymm2
2638 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2639 ; AVX-NEXT: vmovaps 688(%rdi), %xmm0
2640 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm2[1,3],ymm0[6,5],ymm2[5,7]
2641 ; AVX-NEXT: vmovaps 672(%rdi), %ymm1
2642 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2643 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7]
2644 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,3],ymm0[0,2],ymm1[4,7],ymm0[4,6]
2645 ; AVX-NEXT: vmovaps 736(%rdi), %ymm1
2646 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2647 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm1[2,3,0,1]
2648 ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0],ymm15[2,0],ymm1[5,4],ymm15[6,4]
2649 ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
2650 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
2651 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2652 ; AVX-NEXT: vmovaps 32(%rdi), %ymm7
2653 ; AVX-NEXT: vmovaps 16(%rdi), %xmm0
2654 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm7[1,3],ymm0[6,5],ymm7[5,7]
2655 ; AVX-NEXT: vmovaps (%rdi), %ymm1
2656 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2657 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm7[1],ymm1[2,3],ymm7[4],ymm1[5,6],ymm7[7]
2658 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,3],ymm0[0,2],ymm1[4,7],ymm0[4,6]
2659 ; AVX-NEXT: vmovaps 64(%rdi), %ymm4
2660 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3,0,1]
2661 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2662 ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,0],ymm1[2,0],ymm4[5,4],ymm1[6,4]
2663 ; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2664 ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
2665 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
2666 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2667 ; AVX-NEXT: vmovaps 224(%rdi), %ymm6
2668 ; AVX-NEXT: vmovaps 208(%rdi), %xmm0
2669 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm6[1,3],ymm0[6,5],ymm6[5,7]
2670 ; AVX-NEXT: vmovaps 192(%rdi), %ymm1
2671 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2672 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3],ymm6[4],ymm1[5,6],ymm6[7]
2673 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,3],ymm0[0,2],ymm1[4,7],ymm0[4,6]
2674 ; AVX-NEXT: vmovaps 256(%rdi), %ymm5
2675 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[2,3,0,1]
2676 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2677 ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,0],ymm1[2,0],ymm5[5,4],ymm1[6,4]
2678 ; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2679 ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
2680 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
2681 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2682 ; AVX-NEXT: vmovaps 416(%rdi), %ymm12
2683 ; AVX-NEXT: vmovaps 400(%rdi), %xmm0
2684 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm12[1,3],ymm0[6,5],ymm12[5,7]
2685 ; AVX-NEXT: vmovaps 384(%rdi), %ymm1
2686 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2687 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm12[1],ymm1[2,3],ymm12[4],ymm1[5,6],ymm12[7]
2688 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,3],ymm0[0,2],ymm1[4,7],ymm0[4,6]
2689 ; AVX-NEXT: vmovaps 448(%rdi), %ymm8
2690 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm8[2,3,0,1]
2691 ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,0],ymm9[2,0],ymm8[5,4],ymm9[6,4]
2692 ; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2693 ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
2694 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
2695 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2696 ; AVX-NEXT: vmovaps 608(%rdi), %ymm10
2697 ; AVX-NEXT: vmovaps 592(%rdi), %xmm0
2698 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm10[1,3],ymm0[6,5],ymm10[5,7]
2699 ; AVX-NEXT: vmovaps 576(%rdi), %ymm1
2700 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2701 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm10[1],ymm1[2,3],ymm10[4],ymm1[5,6],ymm10[7]
2702 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,3],ymm0[0,2],ymm1[4,7],ymm0[4,6]
2703 ; AVX-NEXT: vmovaps 640(%rdi), %ymm13
2704 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm13[2,3,0,1]
2705 ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,0],ymm11[2,0],ymm13[5,4],ymm11[6,4]
2706 ; AVX-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2707 ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
2708 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
2709 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2710 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2711 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2712 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4]
2713 ; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm1[0,0],ymm0[2,0],ymm1[4,4],ymm0[6,4]
2714 ; AVX-NEXT: vmovaps 112(%rdi), %xmm0
2715 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2716 ; AVX-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload
2717 ; AVX-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
2718 ; AVX-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
2719 ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,2],ymm0[0,3],ymm1[5,6],ymm0[4,7]
2720 ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5]
2721 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm2[5,6,7]
2722 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2723 ; AVX-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2724 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2725 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm14[3,0],ymm0[6,4],ymm14[7,4]
2726 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm14[0,0],ymm0[2,0],ymm14[4,4],ymm0[6,4]
2727 ; AVX-NEXT: vmovaps 304(%rdi), %xmm2
2728 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2729 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2730 ; AVX-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
2731 ; AVX-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
2732 ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,2],ymm2[0,3],ymm1[5,6],ymm2[4,7]
2733 ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5]
2734 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
2735 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2736 ; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2737 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2738 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm3[3,0],ymm0[6,4],ymm3[7,4]
2739 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,0],ymm0[2,0],ymm3[4,4],ymm0[6,4]
2740 ; AVX-NEXT: vmovaps 496(%rdi), %xmm3
2741 ; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2742 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2743 ; AVX-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
2744 ; AVX-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
2745 ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,2],ymm3[0,3],ymm1[5,6],ymm3[4,7]
2746 ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5]
2747 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
2748 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2749 ; AVX-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2750 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2751 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm15[3,0],ymm0[6,4],ymm15[7,4]
2752 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm15[0,0],ymm0[2,0],ymm15[4,4],ymm0[6,4]
2753 ; AVX-NEXT: vmovaps 688(%rdi), %xmm3
2754 ; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2755 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2756 ; AVX-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
2757 ; AVX-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
2758 ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,2],ymm3[0,3],ymm1[5,6],ymm3[4,7]
2759 ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5]
2760 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
2761 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2762 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2763 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm4[2,0],ymm1[3,0],ymm4[6,4],ymm1[7,4]
2764 ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm0[2,0],ymm1[4,4],ymm0[6,4]
2765 ; AVX-NEXT: vmovaps 16(%rdi), %xmm4
2766 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2767 ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1],ymm7[2],ymm0[3,4],ymm7[5],ymm0[6,7]
2768 ; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,2],ymm4[0,3],ymm2[5,6],ymm4[4,7]
2769 ; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,3,1,4,6,7,5]
2770 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
2771 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2772 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2773 ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm5[2,0],ymm2[3,0],ymm5[6,4],ymm2[7,4]
2774 ; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,0],ymm1[2,0],ymm2[4,4],ymm1[6,4]
2775 ; AVX-NEXT: vmovaps 208(%rdi), %xmm5
2776 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2777 ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1],ymm6[2],ymm1[3,4],ymm6[5],ymm1[6,7]
2778 ; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,2],ymm5[0,3],ymm3[5,6],ymm5[4,7]
2779 ; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,3,1,4,6,7,5]
2780 ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7]
2781 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2782 ; AVX-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2783 ; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm8[2,0],ymm9[3,0],ymm8[6,4],ymm9[7,4]
2784 ; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm9[0,0],ymm2[2,0],ymm9[4,4],ymm2[6,4]
2785 ; AVX-NEXT: vmovaps 400(%rdi), %xmm8
2786 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2787 ; AVX-NEXT: vblendps {{.*#+}} ymm15 = ymm2[0,1],ymm12[2],ymm2[3,4],ymm12[5],ymm2[6,7]
2788 ; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm15[1,2],ymm8[0,3],ymm15[5,6],ymm8[4,7]
2789 ; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,2,3,1,4,6,7,5]
2790 ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1,2,3,4],ymm3[5,6,7]
2791 ; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2792 ; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm13[2,0],ymm11[3,0],ymm13[6,4],ymm11[7,4]
2793 ; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm11[0,0],ymm3[2,0],ymm11[4,4],ymm3[6,4]
2794 ; AVX-NEXT: vmovaps 592(%rdi), %xmm9
2795 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
2796 ; AVX-NEXT: vblendps {{.*#+}} ymm14 = ymm3[0,1],ymm10[2],ymm3[3,4],ymm10[5],ymm3[6,7]
2797 ; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm14[1,2],ymm9[0,3],ymm14[5,6],ymm9[4,7]
2798 ; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,2,3,1,4,6,7,5]
2799 ; AVX-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm15[5,6,7]
2800 ; AVX-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2801 ; AVX-NEXT: vmovups (%rsp), %ymm15 # 32-byte Reload
2802 ; AVX-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm14 # 32-byte Folded Reload
2803 ; AVX-NEXT: # ymm14 = ymm15[0,1],mem[2],ymm15[3,4],mem[5],ymm15[6,7]
2804 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
2805 ; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm13[1,0],ymm14[2,0],ymm13[5,4],ymm14[6,4]
2806 ; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0],ymm15[0,3],ymm14[6,4],ymm15[4,7]
2807 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
2808 ; AVX-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
2809 ; AVX-NEXT: # ymm15 = ymm15[0,1],mem[0,3],ymm15[4,5],mem[4,7]
2810 ; AVX-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm15[5,6,7]
2811 ; AVX-NEXT: vblendps {{.*#+}} ymm14 = ymm7[0,1],ymm0[2],ymm7[3,4],ymm0[5],ymm7[6,7]
2812 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,0],ymm14[2,0],ymm4[5,4],ymm14[6,4]
2813 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm7[0,3],ymm0[6,4],ymm7[4,7]
2814 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2815 ; AVX-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm7 # 32-byte Folded Reload
2816 ; AVX-NEXT: # ymm7 = ymm4[0,1],mem[0,3],ymm4[4,5],mem[4,7]
2817 ; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3,4],ymm7[5,6,7]
2818 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
2819 ; AVX-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload
2820 ; AVX-NEXT: # ymm0 = ymm14[0,1],mem[2],ymm14[3,4],mem[5],ymm14[6,7]
2821 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2822 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,0],ymm0[2,0],ymm4[5,4],ymm0[6,4]
2823 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm14[0,3],ymm0[6,4],ymm14[4,7]
2824 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2825 ; AVX-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm14 # 32-byte Folded Reload
2826 ; AVX-NEXT: # ymm14 = ymm4[0,1],mem[0,3],ymm4[4,5],mem[4,7]
2827 ; AVX-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3,4],ymm14[5,6,7]
2828 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm1[2],ymm6[3,4],ymm1[5],ymm6[6,7]
2829 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm5[1,0],ymm0[2,0],ymm5[5,4],ymm0[6,4]
2830 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm6[0,3],ymm0[6,4],ymm6[4,7]
2831 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2832 ; AVX-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
2833 ; AVX-NEXT: # ymm1 = ymm1[0,1],mem[0,3],ymm1[4,5],mem[4,7]
2834 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
2835 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
2836 ; AVX-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm1 # 32-byte Folded Reload
2837 ; AVX-NEXT: # ymm1 = ymm6[0,1],mem[2],ymm6[3,4],mem[5],ymm6[6,7]
2838 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2839 ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,0],ymm1[2,0],ymm4[5,4],ymm1[6,4]
2840 ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm6[0,3],ymm1[6,4],ymm6[4,7]
2841 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2842 ; AVX-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm6 # 32-byte Folded Reload
2843 ; AVX-NEXT: # ymm6 = ymm4[0,1],mem[0,3],ymm4[4,5],mem[4,7]
2844 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm6[5,6,7]
2845 ; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm12[0,1],ymm2[2],ymm12[3,4],ymm2[5],ymm12[6,7]
2846 ; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm8[1,0],ymm6[2,0],ymm8[5,4],ymm6[6,4]
2847 ; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm12[0,3],ymm2[6,4],ymm12[4,7]
2848 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2849 ; AVX-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload
2850 ; AVX-NEXT: # ymm5 = ymm4[0,1],mem[0,3],ymm4[4,5],mem[4,7]
2851 ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6,7]
2852 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
2853 ; AVX-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm5 # 32-byte Folded Reload
2854 ; AVX-NEXT: # ymm5 = ymm6[0,1],mem[2],ymm6[3,4],mem[5],ymm6[6,7]
2855 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2856 ; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm4[1,0],ymm5[2,0],ymm4[5,4],ymm5[6,4]
2857 ; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0],ymm6[0,3],ymm5[6,4],ymm6[4,7]
2858 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2859 ; AVX-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm6 # 32-byte Folded Reload
2860 ; AVX-NEXT: # ymm6 = ymm4[0,1],mem[0,3],ymm4[4,5],mem[4,7]
2861 ; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7]
2862 ; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0,1],ymm3[2],ymm10[3,4],ymm3[5],ymm10[6,7]
2863 ; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm9[1,0],ymm6[2,0],ymm9[5,4],ymm6[6,4]
2864 ; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0],ymm10[0,3],ymm3[6,4],ymm10[4,7]
2865 ; AVX-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm4 # 32-byte Folded Reload
2866 ; AVX-NEXT: # ymm4 = ymm11[0,1],mem[0,3],ymm11[4,5],mem[4,7]
2867 ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7]
2868 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2869 ; AVX-NEXT: vmovaps %ymm4, 192(%rsi)
2870 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2871 ; AVX-NEXT: vmovaps %ymm4, 128(%rsi)
2872 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2873 ; AVX-NEXT: vmovaps %ymm4, 64(%rsi)
2874 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2875 ; AVX-NEXT: vmovaps %ymm4, (%rsi)
2876 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2877 ; AVX-NEXT: vmovaps %ymm4, 224(%rsi)
2878 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2879 ; AVX-NEXT: vmovaps %ymm4, 160(%rsi)
2880 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2881 ; AVX-NEXT: vmovaps %ymm4, 96(%rsi)
2882 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2883 ; AVX-NEXT: vmovaps %ymm4, 32(%rsi)
2884 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2885 ; AVX-NEXT: vmovaps %ymm4, 192(%rdx)
2886 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2887 ; AVX-NEXT: vmovaps %ymm4, 128(%rdx)
2888 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2889 ; AVX-NEXT: vmovaps %ymm4, 64(%rdx)
2890 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2891 ; AVX-NEXT: vmovaps %ymm4, (%rdx)
2892 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2893 ; AVX-NEXT: vmovaps %ymm4, 224(%rdx)
2894 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2895 ; AVX-NEXT: vmovaps %ymm4, 160(%rdx)
2896 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2897 ; AVX-NEXT: vmovaps %ymm4, 96(%rdx)
2898 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2899 ; AVX-NEXT: vmovaps %ymm4, 32(%rdx)
2900 ; AVX-NEXT: vmovaps %ymm3, 192(%rcx)
2901 ; AVX-NEXT: vmovaps %ymm5, 224(%rcx)
2902 ; AVX-NEXT: vmovaps %ymm2, 128(%rcx)
2903 ; AVX-NEXT: vmovaps %ymm1, 160(%rcx)
2904 ; AVX-NEXT: vmovaps %ymm0, 64(%rcx)
2905 ; AVX-NEXT: vmovaps %ymm14, 96(%rcx)
2906 ; AVX-NEXT: vmovaps %ymm7, (%rcx)
2907 ; AVX-NEXT: vmovaps %ymm13, 32(%rcx)
2908 ; AVX-NEXT: addq $1384, %rsp # imm = 0x568
2909 ; AVX-NEXT: vzeroupper
2912 ; AVX2-LABEL: load_i32_stride3_vf64:
2914 ; AVX2-NEXT: subq $1032, %rsp # imm = 0x408
2915 ; AVX2-NEXT: vmovaps 736(%rdi), %ymm2
2916 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2917 ; AVX2-NEXT: vmovaps 704(%rdi), %ymm3
2918 ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2919 ; AVX2-NEXT: vmovaps 672(%rdi), %ymm4
2920 ; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2921 ; AVX2-NEXT: vmovaps 544(%rdi), %ymm5
2922 ; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2923 ; AVX2-NEXT: vmovaps 512(%rdi), %ymm6
2924 ; AVX2-NEXT: vmovups %ymm6, (%rsp) # 32-byte Spill
2925 ; AVX2-NEXT: vmovaps 480(%rdi), %ymm7
2926 ; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2927 ; AVX2-NEXT: vmovaps 352(%rdi), %ymm8
2928 ; AVX2-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2929 ; AVX2-NEXT: vmovaps 320(%rdi), %ymm10
2930 ; AVX2-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2931 ; AVX2-NEXT: vmovaps 288(%rdi), %ymm11
2932 ; AVX2-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2933 ; AVX2-NEXT: vmovaps 160(%rdi), %ymm9
2934 ; AVX2-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2935 ; AVX2-NEXT: vmovaps 128(%rdi), %ymm0
2936 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2937 ; AVX2-NEXT: vmovaps 96(%rdi), %ymm1
2938 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2939 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
2940 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1],ymm9[2],ymm0[3,4],ymm9[5],ymm0[6,7]
2941 ; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [0,3,6,1,4,7,2,5]
2942 ; AVX2-NEXT: vpermps %ymm1, %ymm0, %ymm1
2943 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2944 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5,6],ymm10[7]
2945 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm8[2],ymm1[3,4],ymm8[5],ymm1[6,7]
2946 ; AVX2-NEXT: vpermps %ymm1, %ymm0, %ymm1
2947 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2948 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6],ymm6[7]
2949 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm5[2],ymm1[3,4],ymm5[5],ymm1[6,7]
2950 ; AVX2-NEXT: vpermps %ymm1, %ymm0, %ymm1
2951 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2952 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7]
2953 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
2954 ; AVX2-NEXT: vpermps %ymm1, %ymm0, %ymm1
2955 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2956 ; AVX2-NEXT: vmovaps (%rdi), %ymm1
2957 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2958 ; AVX2-NEXT: vmovaps 32(%rdi), %ymm3
2959 ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2960 ; AVX2-NEXT: vmovaps 64(%rdi), %ymm8
2961 ; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7]
2962 ; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm8[2],ymm4[3,4],ymm8[5],ymm4[6,7]
2963 ; AVX2-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2964 ; AVX2-NEXT: vpermps %ymm4, %ymm0, %ymm4
2965 ; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2966 ; AVX2-NEXT: vmovaps 256(%rdi), %ymm1
2967 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2968 ; AVX2-NEXT: vmovaps 224(%rdi), %ymm2
2969 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2970 ; AVX2-NEXT: vmovaps 192(%rdi), %ymm3
2971 ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2972 ; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7]
2973 ; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm1[2],ymm6[3,4],ymm1[5],ymm6[6,7]
2974 ; AVX2-NEXT: vpermps %ymm6, %ymm0, %ymm6
2975 ; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2976 ; AVX2-NEXT: vmovaps 448(%rdi), %ymm13
2977 ; AVX2-NEXT: vmovaps 416(%rdi), %ymm12
2978 ; AVX2-NEXT: vmovaps 384(%rdi), %ymm14
2979 ; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm14[0],ymm12[1],ymm14[2,3],ymm12[4],ymm14[5,6],ymm12[7]
2980 ; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2981 ; AVX2-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2982 ; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm13[2],ymm10[3,4],ymm13[5],ymm10[6,7]
2983 ; AVX2-NEXT: vpermps %ymm10, %ymm0, %ymm10
2984 ; AVX2-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2985 ; AVX2-NEXT: vmovaps 640(%rdi), %ymm6
2986 ; AVX2-NEXT: vmovaps 608(%rdi), %ymm5
2987 ; AVX2-NEXT: vmovaps 576(%rdi), %ymm7
2988 ; AVX2-NEXT: vblendps {{.*#+}} ymm15 = ymm7[0],ymm5[1],ymm7[2,3],ymm5[4],ymm7[5,6],ymm5[7]
2989 ; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2990 ; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2991 ; AVX2-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1],ymm6[2],ymm15[3,4],ymm6[5],ymm15[6,7]
2992 ; AVX2-NEXT: vpermps %ymm15, %ymm0, %ymm0
2993 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2994 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
2995 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2996 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm4[2],ymm9[3,4],ymm4[5],ymm9[6,7]
2997 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
2998 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0],ymm0[1,2],ymm11[3],ymm0[4,5],ymm11[6],ymm0[7]
2999 ; AVX2-NEXT: vmovaps {{.*#+}} ymm15 = [1,4,7,2,5,0,3,6]
3000 ; AVX2-NEXT: vpermps %ymm0, %ymm15, %ymm0
3001 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3002 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3003 ; AVX2-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
3004 ; AVX2-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
3005 ; AVX2-NEXT: vblendps $73, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
3006 ; AVX2-NEXT: # ymm0 = mem[0],ymm0[1,2],mem[3],ymm0[4,5],mem[6],ymm0[7]
3007 ; AVX2-NEXT: vpermps %ymm0, %ymm15, %ymm0
3008 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3009 ; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
3010 ; AVX2-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
3011 ; AVX2-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
3012 ; AVX2-NEXT: vblendps $73, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
3013 ; AVX2-NEXT: # ymm0 = mem[0],ymm0[1,2],mem[3],ymm0[4,5],mem[6],ymm0[7]
3014 ; AVX2-NEXT: vpermps %ymm0, %ymm15, %ymm0
3015 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3016 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3017 ; AVX2-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
3018 ; AVX2-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
3019 ; AVX2-NEXT: vblendps $73, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
3020 ; AVX2-NEXT: # ymm0 = mem[0],ymm0[1,2],mem[3],ymm0[4,5],mem[6],ymm0[7]
3021 ; AVX2-NEXT: vpermps %ymm0, %ymm15, %ymm0
3022 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3023 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
3024 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
3025 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
3026 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0],ymm0[1,2],ymm8[3],ymm0[4,5],ymm8[6],ymm0[7]
3027 ; AVX2-NEXT: vpermps %ymm0, %ymm15, %ymm0
3028 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3029 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
3030 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
3031 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm3[2],ymm8[3,4],ymm3[5],ymm8[6,7]
3032 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
3033 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0],ymm0[1,2],ymm10[3],ymm0[4,5],ymm10[6],ymm0[7]
3034 ; AVX2-NEXT: vpermps %ymm0, %ymm15, %ymm0
3035 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3036 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm12[2],ymm14[3,4],ymm12[5],ymm14[6,7]
3037 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0],ymm0[1,2],ymm13[3],ymm0[4,5],ymm13[6],ymm0[7]
3038 ; AVX2-NEXT: vmovaps %ymm13, %ymm14
3039 ; AVX2-NEXT: vpermps %ymm0, %ymm15, %ymm13
3040 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1],ymm5[2],ymm7[3,4],ymm5[5],ymm7[6,7]
3041 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0],ymm0[1,2],ymm6[3],ymm0[4,5],ymm6[6],ymm0[7]
3042 ; AVX2-NEXT: vmovaps %ymm6, %ymm7
3043 ; AVX2-NEXT: vpermps %ymm0, %ymm15, %ymm15
3044 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm9[2],ymm4[3,4],ymm9[5],ymm4[6,7]
3045 ; AVX2-NEXT: vmovaps {{.*#+}} ymm12 = [2,5,0,3,6,u,u,u]
3046 ; AVX2-NEXT: vpermps %ymm0, %ymm12, %ymm0
3047 ; AVX2-NEXT: vshufps {{.*#+}} ymm9 = ymm11[0,1,0,3,4,5,4,7]
3048 ; AVX2-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,0,3]
3049 ; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3,4],ymm9[5,6,7]
3050 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
3051 ; AVX2-NEXT: vpermps %ymm0, %ymm12, %ymm0
3052 ; AVX2-NEXT: vpermilps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
3053 ; AVX2-NEXT: # ymm1 = mem[0,1,0,3,4,5,4,7]
3054 ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
3055 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
3056 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
3057 ; AVX2-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
3058 ; AVX2-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7]
3059 ; AVX2-NEXT: vpermps %ymm1, %ymm12, %ymm1
3060 ; AVX2-NEXT: vpermilps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
3061 ; AVX2-NEXT: # ymm2 = mem[0,1,0,3,4,5,4,7]
3062 ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
3063 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
3064 ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm8[2],ymm3[3,4],ymm8[5],ymm3[6,7]
3065 ; AVX2-NEXT: vpermps %ymm2, %ymm12, %ymm2
3066 ; AVX2-NEXT: vshufps {{.*#+}} ymm3 = ymm10[0,1,0,3,4,5,4,7]
3067 ; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3]
3068 ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7]
3069 ; AVX2-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload
3070 ; AVX2-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
3071 ; AVX2-NEXT: # ymm3 = ymm3[0,1],mem[2],ymm3[3,4],mem[5],ymm3[6,7]
3072 ; AVX2-NEXT: vpermps %ymm3, %ymm12, %ymm3
3073 ; AVX2-NEXT: vpermilps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
3074 ; AVX2-NEXT: # ymm4 = mem[0,1,0,3,4,5,4,7]
3075 ; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3]
3076 ; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7]
3077 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3078 ; AVX2-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
3079 ; AVX2-NEXT: # ymm4 = mem[0,1],ymm4[2],mem[3,4],ymm4[5],mem[6,7]
3080 ; AVX2-NEXT: vpermps %ymm4, %ymm12, %ymm4
3081 ; AVX2-NEXT: vshufps {{.*#+}} ymm5 = ymm14[0,1,0,3,4,5,4,7]
3082 ; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3]
3083 ; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7]
3084 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
3085 ; AVX2-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
3086 ; AVX2-NEXT: # ymm5 = ymm5[0,1],mem[2],ymm5[3,4],mem[5],ymm5[6,7]
3087 ; AVX2-NEXT: vpermps %ymm5, %ymm12, %ymm5
3088 ; AVX2-NEXT: vpermilps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
3089 ; AVX2-NEXT: # ymm6 = mem[0,1,0,3,4,5,4,7]
3090 ; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3]
3091 ; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7]
3092 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
3093 ; AVX2-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
3094 ; AVX2-NEXT: # ymm6 = ymm6[0,1],mem[2],ymm6[3,4],mem[5],ymm6[6,7]
3095 ; AVX2-NEXT: vpermps %ymm6, %ymm12, %ymm6
3096 ; AVX2-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,1,0,3,4,5,4,7]
3097 ; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3]
3098 ; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7]
3099 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
3100 ; AVX2-NEXT: vmovaps %ymm7, 192(%rsi)
3101 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
3102 ; AVX2-NEXT: vmovaps %ymm7, 128(%rsi)
3103 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
3104 ; AVX2-NEXT: vmovaps %ymm7, 64(%rsi)
3105 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
3106 ; AVX2-NEXT: vmovaps %ymm7, (%rsi)
3107 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
3108 ; AVX2-NEXT: vmovaps %ymm7, 224(%rsi)
3109 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
3110 ; AVX2-NEXT: vmovaps %ymm7, 160(%rsi)
3111 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
3112 ; AVX2-NEXT: vmovaps %ymm7, 96(%rsi)
3113 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
3114 ; AVX2-NEXT: vmovaps %ymm7, 32(%rsi)
3115 ; AVX2-NEXT: vmovaps %ymm15, 192(%rdx)
3116 ; AVX2-NEXT: vmovaps %ymm13, 128(%rdx)
3117 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
3118 ; AVX2-NEXT: vmovaps %ymm7, 64(%rdx)
3119 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
3120 ; AVX2-NEXT: vmovaps %ymm7, (%rdx)
3121 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
3122 ; AVX2-NEXT: vmovaps %ymm7, 224(%rdx)
3123 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
3124 ; AVX2-NEXT: vmovaps %ymm7, 160(%rdx)
3125 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
3126 ; AVX2-NEXT: vmovaps %ymm7, 96(%rdx)
3127 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
3128 ; AVX2-NEXT: vmovaps %ymm7, 32(%rdx)
3129 ; AVX2-NEXT: vmovaps %ymm6, 192(%rcx)
3130 ; AVX2-NEXT: vmovaps %ymm5, 224(%rcx)
3131 ; AVX2-NEXT: vmovaps %ymm4, 128(%rcx)
3132 ; AVX2-NEXT: vmovaps %ymm3, 160(%rcx)
3133 ; AVX2-NEXT: vmovaps %ymm2, 64(%rcx)
3134 ; AVX2-NEXT: vmovaps %ymm1, 96(%rcx)
3135 ; AVX2-NEXT: vmovaps %ymm0, (%rcx)
3136 ; AVX2-NEXT: vmovaps %ymm9, 32(%rcx)
3137 ; AVX2-NEXT: addq $1032, %rsp # imm = 0x408
3138 ; AVX2-NEXT: vzeroupper
3141 ; AVX2-FP-LABEL: load_i32_stride3_vf64:
3143 ; AVX2-FP-NEXT: subq $1032, %rsp # imm = 0x408
3144 ; AVX2-FP-NEXT: vmovaps 736(%rdi), %ymm2
3145 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3146 ; AVX2-FP-NEXT: vmovaps 704(%rdi), %ymm3
3147 ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3148 ; AVX2-FP-NEXT: vmovaps 672(%rdi), %ymm4
3149 ; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3150 ; AVX2-FP-NEXT: vmovaps 544(%rdi), %ymm5
3151 ; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3152 ; AVX2-FP-NEXT: vmovaps 512(%rdi), %ymm6
3153 ; AVX2-FP-NEXT: vmovups %ymm6, (%rsp) # 32-byte Spill
3154 ; AVX2-FP-NEXT: vmovaps 480(%rdi), %ymm7
3155 ; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3156 ; AVX2-FP-NEXT: vmovaps 352(%rdi), %ymm8
3157 ; AVX2-FP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3158 ; AVX2-FP-NEXT: vmovaps 320(%rdi), %ymm10
3159 ; AVX2-FP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3160 ; AVX2-FP-NEXT: vmovaps 288(%rdi), %ymm11
3161 ; AVX2-FP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3162 ; AVX2-FP-NEXT: vmovaps 160(%rdi), %ymm9
3163 ; AVX2-FP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3164 ; AVX2-FP-NEXT: vmovaps 128(%rdi), %ymm0
3165 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3166 ; AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm1
3167 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3168 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
3169 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1],ymm9[2],ymm0[3,4],ymm9[5],ymm0[6,7]
3170 ; AVX2-FP-NEXT: vmovaps {{.*#+}} ymm0 = [0,3,6,1,4,7,2,5]
3171 ; AVX2-FP-NEXT: vpermps %ymm1, %ymm0, %ymm1
3172 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3173 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5,6],ymm10[7]
3174 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm8[2],ymm1[3,4],ymm8[5],ymm1[6,7]
3175 ; AVX2-FP-NEXT: vpermps %ymm1, %ymm0, %ymm1
3176 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3177 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6],ymm6[7]
3178 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm5[2],ymm1[3,4],ymm5[5],ymm1[6,7]
3179 ; AVX2-FP-NEXT: vpermps %ymm1, %ymm0, %ymm1
3180 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3181 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7]
3182 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
3183 ; AVX2-FP-NEXT: vpermps %ymm1, %ymm0, %ymm1
3184 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3185 ; AVX2-FP-NEXT: vmovaps (%rdi), %ymm1
3186 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3187 ; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm3
3188 ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3189 ; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm8
3190 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7]
3191 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm8[2],ymm4[3,4],ymm8[5],ymm4[6,7]
3192 ; AVX2-FP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3193 ; AVX2-FP-NEXT: vpermps %ymm4, %ymm0, %ymm4
3194 ; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3195 ; AVX2-FP-NEXT: vmovaps 256(%rdi), %ymm1
3196 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3197 ; AVX2-FP-NEXT: vmovaps 224(%rdi), %ymm2
3198 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3199 ; AVX2-FP-NEXT: vmovaps 192(%rdi), %ymm3
3200 ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3201 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7]
3202 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm1[2],ymm6[3,4],ymm1[5],ymm6[6,7]
3203 ; AVX2-FP-NEXT: vpermps %ymm6, %ymm0, %ymm6
3204 ; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3205 ; AVX2-FP-NEXT: vmovaps 448(%rdi), %ymm13
3206 ; AVX2-FP-NEXT: vmovaps 416(%rdi), %ymm12
3207 ; AVX2-FP-NEXT: vmovaps 384(%rdi), %ymm14
3208 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm14[0],ymm12[1],ymm14[2,3],ymm12[4],ymm14[5,6],ymm12[7]
3209 ; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3210 ; AVX2-FP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3211 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm13[2],ymm10[3,4],ymm13[5],ymm10[6,7]
3212 ; AVX2-FP-NEXT: vpermps %ymm10, %ymm0, %ymm10
3213 ; AVX2-FP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3214 ; AVX2-FP-NEXT: vmovaps 640(%rdi), %ymm6
3215 ; AVX2-FP-NEXT: vmovaps 608(%rdi), %ymm5
3216 ; AVX2-FP-NEXT: vmovaps 576(%rdi), %ymm7
3217 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm15 = ymm7[0],ymm5[1],ymm7[2,3],ymm5[4],ymm7[5,6],ymm5[7]
3218 ; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3219 ; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3220 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1],ymm6[2],ymm15[3,4],ymm6[5],ymm15[6,7]
3221 ; AVX2-FP-NEXT: vpermps %ymm15, %ymm0, %ymm0
3222 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3223 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
3224 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3225 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm4[2],ymm9[3,4],ymm4[5],ymm9[6,7]
3226 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
3227 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0],ymm0[1,2],ymm11[3],ymm0[4,5],ymm11[6],ymm0[7]
3228 ; AVX2-FP-NEXT: vmovaps {{.*#+}} ymm15 = [1,4,7,2,5,0,3,6]
3229 ; AVX2-FP-NEXT: vpermps %ymm0, %ymm15, %ymm0
3230 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3231 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3232 ; AVX2-FP-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
3233 ; AVX2-FP-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
3234 ; AVX2-FP-NEXT: vblendps $73, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
3235 ; AVX2-FP-NEXT: # ymm0 = mem[0],ymm0[1,2],mem[3],ymm0[4,5],mem[6],ymm0[7]
3236 ; AVX2-FP-NEXT: vpermps %ymm0, %ymm15, %ymm0
3237 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3238 ; AVX2-FP-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
3239 ; AVX2-FP-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
3240 ; AVX2-FP-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
3241 ; AVX2-FP-NEXT: vblendps $73, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
3242 ; AVX2-FP-NEXT: # ymm0 = mem[0],ymm0[1,2],mem[3],ymm0[4,5],mem[6],ymm0[7]
3243 ; AVX2-FP-NEXT: vpermps %ymm0, %ymm15, %ymm0
3244 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3245 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3246 ; AVX2-FP-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
3247 ; AVX2-FP-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
3248 ; AVX2-FP-NEXT: vblendps $73, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
3249 ; AVX2-FP-NEXT: # ymm0 = mem[0],ymm0[1,2],mem[3],ymm0[4,5],mem[6],ymm0[7]
3250 ; AVX2-FP-NEXT: vpermps %ymm0, %ymm15, %ymm0
3251 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3252 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
3253 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
3254 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
3255 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0],ymm0[1,2],ymm8[3],ymm0[4,5],ymm8[6],ymm0[7]
3256 ; AVX2-FP-NEXT: vpermps %ymm0, %ymm15, %ymm0
3257 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3258 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
3259 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
3260 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm3[2],ymm8[3,4],ymm3[5],ymm8[6,7]
3261 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
3262 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0],ymm0[1,2],ymm10[3],ymm0[4,5],ymm10[6],ymm0[7]
3263 ; AVX2-FP-NEXT: vpermps %ymm0, %ymm15, %ymm0
3264 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3265 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm12[2],ymm14[3,4],ymm12[5],ymm14[6,7]
3266 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0],ymm0[1,2],ymm13[3],ymm0[4,5],ymm13[6],ymm0[7]
3267 ; AVX2-FP-NEXT: vmovaps %ymm13, %ymm14
3268 ; AVX2-FP-NEXT: vpermps %ymm0, %ymm15, %ymm13
3269 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1],ymm5[2],ymm7[3,4],ymm5[5],ymm7[6,7]
3270 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0],ymm0[1,2],ymm6[3],ymm0[4,5],ymm6[6],ymm0[7]
3271 ; AVX2-FP-NEXT: vmovaps %ymm6, %ymm7
3272 ; AVX2-FP-NEXT: vpermps %ymm0, %ymm15, %ymm15
3273 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm9[2],ymm4[3,4],ymm9[5],ymm4[6,7]
3274 ; AVX2-FP-NEXT: vmovaps {{.*#+}} ymm12 = [2,5,0,3,6,u,u,u]
3275 ; AVX2-FP-NEXT: vpermps %ymm0, %ymm12, %ymm0
3276 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm9 = ymm11[0,1,0,3,4,5,4,7]
3277 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,0,3]
3278 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3,4],ymm9[5,6,7]
3279 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
3280 ; AVX2-FP-NEXT: vpermps %ymm0, %ymm12, %ymm0
3281 ; AVX2-FP-NEXT: vpermilps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
3282 ; AVX2-FP-NEXT: # ymm1 = mem[0,1,0,3,4,5,4,7]
3283 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
3284 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
3285 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
3286 ; AVX2-FP-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
3287 ; AVX2-FP-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7]
3288 ; AVX2-FP-NEXT: vpermps %ymm1, %ymm12, %ymm1
3289 ; AVX2-FP-NEXT: vpermilps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
3290 ; AVX2-FP-NEXT: # ymm2 = mem[0,1,0,3,4,5,4,7]
3291 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
3292 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
3293 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm8[2],ymm3[3,4],ymm8[5],ymm3[6,7]
3294 ; AVX2-FP-NEXT: vpermps %ymm2, %ymm12, %ymm2
3295 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm3 = ymm10[0,1,0,3,4,5,4,7]
3296 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3]
3297 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7]
3298 ; AVX2-FP-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload
3299 ; AVX2-FP-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
3300 ; AVX2-FP-NEXT: # ymm3 = ymm3[0,1],mem[2],ymm3[3,4],mem[5],ymm3[6,7]
3301 ; AVX2-FP-NEXT: vpermps %ymm3, %ymm12, %ymm3
3302 ; AVX2-FP-NEXT: vpermilps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
3303 ; AVX2-FP-NEXT: # ymm4 = mem[0,1,0,3,4,5,4,7]
3304 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3]
3305 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7]
3306 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3307 ; AVX2-FP-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
3308 ; AVX2-FP-NEXT: # ymm4 = mem[0,1],ymm4[2],mem[3,4],ymm4[5],mem[6,7]
3309 ; AVX2-FP-NEXT: vpermps %ymm4, %ymm12, %ymm4
3310 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm5 = ymm14[0,1,0,3,4,5,4,7]
3311 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3]
3312 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7]
3313 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
3314 ; AVX2-FP-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
3315 ; AVX2-FP-NEXT: # ymm5 = ymm5[0,1],mem[2],ymm5[3,4],mem[5],ymm5[6,7]
3316 ; AVX2-FP-NEXT: vpermps %ymm5, %ymm12, %ymm5
3317 ; AVX2-FP-NEXT: vpermilps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
3318 ; AVX2-FP-NEXT: # ymm6 = mem[0,1,0,3,4,5,4,7]
3319 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3]
3320 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7]
3321 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
3322 ; AVX2-FP-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
3323 ; AVX2-FP-NEXT: # ymm6 = ymm6[0,1],mem[2],ymm6[3,4],mem[5],ymm6[6,7]
3324 ; AVX2-FP-NEXT: vpermps %ymm6, %ymm12, %ymm6
3325 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,1,0,3,4,5,4,7]
3326 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3]
3327 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7]
3328 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
3329 ; AVX2-FP-NEXT: vmovaps %ymm7, 192(%rsi)
3330 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
3331 ; AVX2-FP-NEXT: vmovaps %ymm7, 128(%rsi)
3332 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
3333 ; AVX2-FP-NEXT: vmovaps %ymm7, 64(%rsi)
3334 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
3335 ; AVX2-FP-NEXT: vmovaps %ymm7, (%rsi)
3336 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
3337 ; AVX2-FP-NEXT: vmovaps %ymm7, 224(%rsi)
3338 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
3339 ; AVX2-FP-NEXT: vmovaps %ymm7, 160(%rsi)
3340 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
3341 ; AVX2-FP-NEXT: vmovaps %ymm7, 96(%rsi)
3342 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
3343 ; AVX2-FP-NEXT: vmovaps %ymm7, 32(%rsi)
3344 ; AVX2-FP-NEXT: vmovaps %ymm15, 192(%rdx)
3345 ; AVX2-FP-NEXT: vmovaps %ymm13, 128(%rdx)
3346 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
3347 ; AVX2-FP-NEXT: vmovaps %ymm7, 64(%rdx)
3348 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
3349 ; AVX2-FP-NEXT: vmovaps %ymm7, (%rdx)
3350 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
3351 ; AVX2-FP-NEXT: vmovaps %ymm7, 224(%rdx)
3352 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
3353 ; AVX2-FP-NEXT: vmovaps %ymm7, 160(%rdx)
3354 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
3355 ; AVX2-FP-NEXT: vmovaps %ymm7, 96(%rdx)
3356 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
3357 ; AVX2-FP-NEXT: vmovaps %ymm7, 32(%rdx)
3358 ; AVX2-FP-NEXT: vmovaps %ymm6, 192(%rcx)
3359 ; AVX2-FP-NEXT: vmovaps %ymm5, 224(%rcx)
3360 ; AVX2-FP-NEXT: vmovaps %ymm4, 128(%rcx)
3361 ; AVX2-FP-NEXT: vmovaps %ymm3, 160(%rcx)
3362 ; AVX2-FP-NEXT: vmovaps %ymm2, 64(%rcx)
3363 ; AVX2-FP-NEXT: vmovaps %ymm1, 96(%rcx)
3364 ; AVX2-FP-NEXT: vmovaps %ymm0, (%rcx)
3365 ; AVX2-FP-NEXT: vmovaps %ymm9, 32(%rcx)
3366 ; AVX2-FP-NEXT: addq $1032, %rsp # imm = 0x408
3367 ; AVX2-FP-NEXT: vzeroupper
3368 ; AVX2-FP-NEXT: retq
3370 ; AVX2-FCP-LABEL: load_i32_stride3_vf64:
3371 ; AVX2-FCP: # %bb.0:
3372 ; AVX2-FCP-NEXT: subq $1032, %rsp # imm = 0x408
3373 ; AVX2-FCP-NEXT: vmovaps 736(%rdi), %ymm2
3374 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3375 ; AVX2-FCP-NEXT: vmovaps 704(%rdi), %ymm3
3376 ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3377 ; AVX2-FCP-NEXT: vmovaps 672(%rdi), %ymm4
3378 ; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3379 ; AVX2-FCP-NEXT: vmovaps 544(%rdi), %ymm5
3380 ; AVX2-FCP-NEXT: vmovups %ymm5, (%rsp) # 32-byte Spill
3381 ; AVX2-FCP-NEXT: vmovaps 512(%rdi), %ymm6
3382 ; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3383 ; AVX2-FCP-NEXT: vmovaps 480(%rdi), %ymm7
3384 ; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3385 ; AVX2-FCP-NEXT: vmovaps 352(%rdi), %ymm8
3386 ; AVX2-FCP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3387 ; AVX2-FCP-NEXT: vmovaps 320(%rdi), %ymm9
3388 ; AVX2-FCP-NEXT: vmovaps 288(%rdi), %ymm10
3389 ; AVX2-FCP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3390 ; AVX2-FCP-NEXT: vmovaps 160(%rdi), %ymm11
3391 ; AVX2-FCP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3392 ; AVX2-FCP-NEXT: vmovaps 128(%rdi), %ymm14
3393 ; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm13
3394 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0],ymm14[1],ymm13[2,3],ymm14[4],ymm13[5,6],ymm14[7]
3395 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1],ymm11[2],ymm0[3,4],ymm11[5],ymm0[6,7]
3396 ; AVX2-FCP-NEXT: vmovaps {{.*#+}} ymm0 = [0,3,6,1,4,7,2,5]
3397 ; AVX2-FCP-NEXT: vpermps %ymm1, %ymm0, %ymm1
3398 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3399 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5,6],ymm9[7]
3400 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm8[2],ymm1[3,4],ymm8[5],ymm1[6,7]
3401 ; AVX2-FCP-NEXT: vpermps %ymm1, %ymm0, %ymm1
3402 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3403 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6],ymm6[7]
3404 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm5[2],ymm1[3,4],ymm5[5],ymm1[6,7]
3405 ; AVX2-FCP-NEXT: vpermps %ymm1, %ymm0, %ymm1
3406 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3407 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7]
3408 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
3409 ; AVX2-FCP-NEXT: vpermps %ymm1, %ymm0, %ymm1
3410 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3411 ; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm5
3412 ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm1
3413 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3414 ; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm3
3415 ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3416 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0],ymm1[1],ymm5[2,3],ymm1[4],ymm5[5,6],ymm1[7]
3417 ; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3418 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7]
3419 ; AVX2-FCP-NEXT: vpermps %ymm1, %ymm0, %ymm1
3420 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3421 ; AVX2-FCP-NEXT: vmovaps 256(%rdi), %ymm2
3422 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3423 ; AVX2-FCP-NEXT: vmovaps 224(%rdi), %ymm4
3424 ; AVX2-FCP-NEXT: vmovaps 192(%rdi), %ymm1
3425 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3426 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3],ymm4[4],ymm1[5,6],ymm4[7]
3427 ; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3428 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
3429 ; AVX2-FCP-NEXT: vpermps %ymm1, %ymm0, %ymm1
3430 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3431 ; AVX2-FCP-NEXT: vmovaps 448(%rdi), %ymm1
3432 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3433 ; AVX2-FCP-NEXT: vmovaps 416(%rdi), %ymm2
3434 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3435 ; AVX2-FCP-NEXT: vmovaps 384(%rdi), %ymm3
3436 ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3437 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7]
3438 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7]
3439 ; AVX2-FCP-NEXT: vpermps %ymm3, %ymm0, %ymm1
3440 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3441 ; AVX2-FCP-NEXT: vmovaps 640(%rdi), %ymm8
3442 ; AVX2-FCP-NEXT: vmovaps 608(%rdi), %ymm7
3443 ; AVX2-FCP-NEXT: vmovaps 576(%rdi), %ymm11
3444 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm15 = ymm11[0],ymm7[1],ymm11[2,3],ymm7[4],ymm11[5,6],ymm7[7]
3445 ; AVX2-FCP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3446 ; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3447 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1],ymm8[2],ymm15[3,4],ymm8[5],ymm15[6,7]
3448 ; AVX2-FCP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3449 ; AVX2-FCP-NEXT: vpermps %ymm15, %ymm0, %ymm0
3450 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3451 ; AVX2-FCP-NEXT: vmovaps %ymm14, %ymm10
3452 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm14[2],ymm13[3,4],ymm14[5],ymm13[6,7]
3453 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
3454 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0],ymm0[1,2],ymm12[3],ymm0[4,5],ymm12[6],ymm0[7]
3455 ; AVX2-FCP-NEXT: vmovaps {{.*#+}} ymm15 = [1,4,7,2,5,0,3,6]
3456 ; AVX2-FCP-NEXT: vpermps %ymm0, %ymm15, %ymm0
3457 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3458 ; AVX2-FCP-NEXT: vmovaps %ymm9, %ymm14
3459 ; AVX2-FCP-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm0 # 32-byte Folded Reload
3460 ; AVX2-FCP-NEXT: # ymm0 = mem[0,1],ymm9[2],mem[3,4],ymm9[5],mem[6,7]
3461 ; AVX2-FCP-NEXT: vblendps $73, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
3462 ; AVX2-FCP-NEXT: # ymm0 = mem[0],ymm0[1,2],mem[3],ymm0[4,5],mem[6],ymm0[7]
3463 ; AVX2-FCP-NEXT: vpermps %ymm0, %ymm15, %ymm0
3464 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3465 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3466 ; AVX2-FCP-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
3467 ; AVX2-FCP-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
3468 ; AVX2-FCP-NEXT: vblendps $73, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload
3469 ; AVX2-FCP-NEXT: # ymm0 = mem[0],ymm0[1,2],mem[3],ymm0[4,5],mem[6],ymm0[7]
3470 ; AVX2-FCP-NEXT: vpermps %ymm0, %ymm15, %ymm0
3471 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3472 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3473 ; AVX2-FCP-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
3474 ; AVX2-FCP-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
3475 ; AVX2-FCP-NEXT: vblendps $73, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
3476 ; AVX2-FCP-NEXT: # ymm0 = mem[0],ymm0[1,2],mem[3],ymm0[4,5],mem[6],ymm0[7]
3477 ; AVX2-FCP-NEXT: vpermps %ymm0, %ymm15, %ymm0
3478 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3479 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
3480 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm1[2],ymm5[3,4],ymm1[5],ymm5[6,7]
3481 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
3482 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7]
3483 ; AVX2-FCP-NEXT: vpermps %ymm0, %ymm15, %ymm0
3484 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3485 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
3486 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
3487 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3488 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm0[1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7]
3489 ; AVX2-FCP-NEXT: vpermps %ymm0, %ymm15, %ymm0
3490 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3491 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
3492 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
3493 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7]
3494 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
3495 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0],ymm0[1,2],ymm9[3],ymm0[4,5],ymm9[6],ymm0[7]
3496 ; AVX2-FCP-NEXT: vpermps %ymm0, %ymm15, %ymm0
3497 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3498 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1],ymm7[2],ymm11[3,4],ymm7[5],ymm11[6,7]
3499 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0],ymm0[1,2],ymm8[3],ymm0[4,5],ymm8[6],ymm0[7]
3500 ; AVX2-FCP-NEXT: vpermps %ymm0, %ymm15, %ymm11
3501 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1],ymm13[2],ymm10[3,4],ymm13[5],ymm10[6,7]
3502 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3],ymm12[4],ymm0[5,6],ymm12[7]
3503 ; AVX2-FCP-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload
3504 ; AVX2-FCP-NEXT: # ymm8 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7]
3505 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm15 = ymm8[0],ymm2[1],ymm8[2,3],ymm2[4],ymm8[5,6],ymm2[7]
3506 ; AVX2-FCP-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm8 # 32-byte Folded Reload
3507 ; AVX2-FCP-NEXT: # ymm8 = ymm14[0,1],mem[2],ymm14[3,4],mem[5],ymm14[6,7]
3508 ; AVX2-FCP-NEXT: vblendps $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
3509 ; AVX2-FCP-NEXT: # ymm8 = ymm8[0],mem[1],ymm8[2,3],mem[4],ymm8[5,6],mem[7]
3510 ; AVX2-FCP-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm2 # 32-byte Folded Reload
3511 ; AVX2-FCP-NEXT: # ymm2 = mem[0,1],ymm3[2],mem[3,4],ymm3[5],mem[6,7]
3512 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6],ymm4[7]
3513 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
3514 ; AVX2-FCP-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload
3515 ; AVX2-FCP-NEXT: # ymm7 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7]
3516 ; AVX2-FCP-NEXT: vblendps $146, (%rsp), %ymm7, %ymm7 # 32-byte Folded Reload
3517 ; AVX2-FCP-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3],mem[4],ymm7[5,6],mem[7]
3518 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7]
3519 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm9[1],ymm1[2,3],ymm9[4],ymm1[5,6],ymm9[7]
3520 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
3521 ; AVX2-FCP-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm6 # 32-byte Folded Reload
3522 ; AVX2-FCP-NEXT: # ymm6 = ymm3[0,1],mem[2],ymm3[3,4],mem[5],ymm3[6,7]
3523 ; AVX2-FCP-NEXT: vblendps $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
3524 ; AVX2-FCP-NEXT: # ymm6 = ymm6[0],mem[1],ymm6[2,3],mem[4],ymm6[5,6],mem[7]
3525 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
3526 ; AVX2-FCP-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
3527 ; AVX2-FCP-NEXT: # ymm3 = mem[0,1],ymm3[2],mem[3,4],ymm3[5],mem[6,7]
3528 ; AVX2-FCP-NEXT: vblendps $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
3529 ; AVX2-FCP-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3],mem[4],ymm3[5,6],mem[7]
3530 ; AVX2-FCP-NEXT: vmovaps {{.*#+}} ymm5 = [2,5,0,3,6,1,4,7]
3531 ; AVX2-FCP-NEXT: vpermps %ymm0, %ymm5, %ymm0
3532 ; AVX2-FCP-NEXT: vpermps %ymm15, %ymm5, %ymm4
3533 ; AVX2-FCP-NEXT: vpermps %ymm8, %ymm5, %ymm8
3534 ; AVX2-FCP-NEXT: vpermps %ymm2, %ymm5, %ymm2
3535 ; AVX2-FCP-NEXT: vpermps %ymm7, %ymm5, %ymm7
3536 ; AVX2-FCP-NEXT: vpermps %ymm1, %ymm5, %ymm1
3537 ; AVX2-FCP-NEXT: vpermps %ymm6, %ymm5, %ymm6
3538 ; AVX2-FCP-NEXT: vpermps %ymm3, %ymm5, %ymm3
3539 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
3540 ; AVX2-FCP-NEXT: vmovaps %ymm5, 192(%rsi)
3541 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
3542 ; AVX2-FCP-NEXT: vmovaps %ymm5, 128(%rsi)
3543 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
3544 ; AVX2-FCP-NEXT: vmovaps %ymm5, 64(%rsi)
3545 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
3546 ; AVX2-FCP-NEXT: vmovaps %ymm5, (%rsi)
3547 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
3548 ; AVX2-FCP-NEXT: vmovaps %ymm5, 224(%rsi)
3549 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
3550 ; AVX2-FCP-NEXT: vmovaps %ymm5, 160(%rsi)
3551 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
3552 ; AVX2-FCP-NEXT: vmovaps %ymm5, 96(%rsi)
3553 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
3554 ; AVX2-FCP-NEXT: vmovaps %ymm5, 32(%rsi)
3555 ; AVX2-FCP-NEXT: vmovaps %ymm11, 192(%rdx)
3556 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
3557 ; AVX2-FCP-NEXT: vmovaps %ymm5, 128(%rdx)
3558 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
3559 ; AVX2-FCP-NEXT: vmovaps %ymm5, 64(%rdx)
3560 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
3561 ; AVX2-FCP-NEXT: vmovaps %ymm5, (%rdx)
3562 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
3563 ; AVX2-FCP-NEXT: vmovaps %ymm5, 224(%rdx)
3564 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
3565 ; AVX2-FCP-NEXT: vmovaps %ymm5, 160(%rdx)
3566 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
3567 ; AVX2-FCP-NEXT: vmovaps %ymm5, 96(%rdx)
3568 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
3569 ; AVX2-FCP-NEXT: vmovaps %ymm5, 32(%rdx)
3570 ; AVX2-FCP-NEXT: vmovaps %ymm3, 192(%rcx)
3571 ; AVX2-FCP-NEXT: vmovaps %ymm6, 224(%rcx)
3572 ; AVX2-FCP-NEXT: vmovaps %ymm1, 128(%rcx)
3573 ; AVX2-FCP-NEXT: vmovaps %ymm7, 160(%rcx)
3574 ; AVX2-FCP-NEXT: vmovaps %ymm2, 64(%rcx)
3575 ; AVX2-FCP-NEXT: vmovaps %ymm8, 96(%rcx)
3576 ; AVX2-FCP-NEXT: vmovaps %ymm4, (%rcx)
3577 ; AVX2-FCP-NEXT: vmovaps %ymm0, 32(%rcx)
3578 ; AVX2-FCP-NEXT: addq $1032, %rsp # imm = 0x408
3579 ; AVX2-FCP-NEXT: vzeroupper
3580 ; AVX2-FCP-NEXT: retq
3582 ; AVX512-LABEL: load_i32_stride3_vf64:
3584 ; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm4
3585 ; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm5
3586 ; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm0
3587 ; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm6
3588 ; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm7
3589 ; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm1
3590 ; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm8
3591 ; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm9
3592 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm2
3593 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm10
3594 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm11
3595 ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm3
3596 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0]
3597 ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm13
3598 ; AVX512-NEXT: vpermt2d %zmm9, %zmm12, %zmm13
3599 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29]
3600 ; AVX512-NEXT: vpermt2d %zmm8, %zmm14, %zmm13
3601 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm15
3602 ; AVX512-NEXT: vpermt2d %zmm7, %zmm12, %zmm15
3603 ; AVX512-NEXT: vpermt2d %zmm6, %zmm14, %zmm15
3604 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm16
3605 ; AVX512-NEXT: vpermt2d %zmm5, %zmm12, %zmm16
3606 ; AVX512-NEXT: vpermt2d %zmm4, %zmm14, %zmm16
3607 ; AVX512-NEXT: vpermi2d %zmm10, %zmm2, %zmm12
3608 ; AVX512-NEXT: vpermt2d %zmm11, %zmm14, %zmm12
3609 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm14 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0]
3610 ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm17
3611 ; AVX512-NEXT: vpermt2d %zmm1, %zmm14, %zmm17
3612 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30]
3613 ; AVX512-NEXT: vpermt2d %zmm6, %zmm18, %zmm17
3614 ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm19
3615 ; AVX512-NEXT: vpermt2d %zmm3, %zmm14, %zmm19
3616 ; AVX512-NEXT: vpermt2d %zmm8, %zmm18, %zmm19
3617 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm20
3618 ; AVX512-NEXT: vpermt2d %zmm0, %zmm14, %zmm20
3619 ; AVX512-NEXT: vpermt2d %zmm4, %zmm18, %zmm20
3620 ; AVX512-NEXT: vpermi2d %zmm2, %zmm10, %zmm14
3621 ; AVX512-NEXT: vpermt2d %zmm11, %zmm18, %zmm14
3622 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm18 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0]
3623 ; AVX512-NEXT: vpermt2d %zmm9, %zmm18, %zmm3
3624 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31]
3625 ; AVX512-NEXT: vpermt2d %zmm8, %zmm9, %zmm3
3626 ; AVX512-NEXT: vpermt2d %zmm5, %zmm18, %zmm0
3627 ; AVX512-NEXT: vpermt2d %zmm4, %zmm9, %zmm0
3628 ; AVX512-NEXT: vpermt2d %zmm7, %zmm18, %zmm1
3629 ; AVX512-NEXT: vpermt2d %zmm6, %zmm9, %zmm1
3630 ; AVX512-NEXT: vpermt2d %zmm10, %zmm18, %zmm2
3631 ; AVX512-NEXT: vpermt2d %zmm11, %zmm9, %zmm2
3632 ; AVX512-NEXT: vmovdqa64 %zmm16, 192(%rsi)
3633 ; AVX512-NEXT: vmovdqa64 %zmm15, 128(%rsi)
3634 ; AVX512-NEXT: vmovdqa64 %zmm13, 64(%rsi)
3635 ; AVX512-NEXT: vmovdqa64 %zmm12, (%rsi)
3636 ; AVX512-NEXT: vmovdqa64 %zmm20, 192(%rdx)
3637 ; AVX512-NEXT: vmovdqa64 %zmm14, (%rdx)
3638 ; AVX512-NEXT: vmovdqa64 %zmm19, 64(%rdx)
3639 ; AVX512-NEXT: vmovdqa64 %zmm17, 128(%rdx)
3640 ; AVX512-NEXT: vmovdqa64 %zmm1, 128(%rcx)
3641 ; AVX512-NEXT: vmovdqa64 %zmm0, 192(%rcx)
3642 ; AVX512-NEXT: vmovdqa64 %zmm2, (%rcx)
3643 ; AVX512-NEXT: vmovdqa64 %zmm3, 64(%rcx)
3644 ; AVX512-NEXT: vzeroupper
3647 ; AVX512-FCP-LABEL: load_i32_stride3_vf64:
3648 ; AVX512-FCP: # %bb.0:
3649 ; AVX512-FCP-NEXT: vmovdqa64 704(%rdi), %zmm4
3650 ; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %zmm5
3651 ; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm0
3652 ; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm6
3653 ; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm7
3654 ; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm1
3655 ; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm8
3656 ; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm9
3657 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm2
3658 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm10
3659 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm11
3660 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3
3661 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0]
3662 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm13
3663 ; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm12, %zmm13
3664 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29]
3665 ; AVX512-FCP-NEXT: vpermt2d %zmm8, %zmm14, %zmm13
3666 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm15
3667 ; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm12, %zmm15
3668 ; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm14, %zmm15
3669 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm16
3670 ; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm12, %zmm16
3671 ; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm14, %zmm16
3672 ; AVX512-FCP-NEXT: vpermi2d %zmm10, %zmm2, %zmm12
3673 ; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm14, %zmm12
3674 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0]
3675 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm17
3676 ; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm14, %zmm17
3677 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30]
3678 ; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm18, %zmm17
3679 ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm19
3680 ; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm14, %zmm19
3681 ; AVX512-FCP-NEXT: vpermt2d %zmm8, %zmm18, %zmm19
3682 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm20
3683 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm14, %zmm20
3684 ; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm18, %zmm20
3685 ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm10, %zmm14
3686 ; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm18, %zmm14
3687 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0]
3688 ; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm18, %zmm3
3689 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31]
3690 ; AVX512-FCP-NEXT: vpermt2d %zmm8, %zmm9, %zmm3
3691 ; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm18, %zmm0
3692 ; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm9, %zmm0
3693 ; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm18, %zmm1
3694 ; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm9, %zmm1
3695 ; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm18, %zmm2
3696 ; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm9, %zmm2
3697 ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, 192(%rsi)
3698 ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, 128(%rsi)
3699 ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 64(%rsi)
3700 ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, (%rsi)
3701 ; AVX512-FCP-NEXT: vmovdqa64 %zmm20, 192(%rdx)
3702 ; AVX512-FCP-NEXT: vmovdqa64 %zmm14, (%rdx)
3703 ; AVX512-FCP-NEXT: vmovdqa64 %zmm19, 64(%rdx)
3704 ; AVX512-FCP-NEXT: vmovdqa64 %zmm17, 128(%rdx)
3705 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 128(%rcx)
3706 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 192(%rcx)
3707 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, (%rcx)
3708 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 64(%rcx)
3709 ; AVX512-FCP-NEXT: vzeroupper
3710 ; AVX512-FCP-NEXT: retq
3712 ; AVX512DQ-LABEL: load_i32_stride3_vf64:
3713 ; AVX512DQ: # %bb.0:
3714 ; AVX512DQ-NEXT: vmovdqa64 704(%rdi), %zmm4
3715 ; AVX512DQ-NEXT: vmovdqa64 640(%rdi), %zmm5
3716 ; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %zmm0
3717 ; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %zmm6
3718 ; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm7
3719 ; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm1
3720 ; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm8
3721 ; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm9
3722 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm2
3723 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm10
3724 ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm11
3725 ; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm3
3726 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0]
3727 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm13
3728 ; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm12, %zmm13
3729 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29]
3730 ; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm14, %zmm13
3731 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm15
3732 ; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm12, %zmm15
3733 ; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm14, %zmm15
3734 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm16
3735 ; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm12, %zmm16
3736 ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm14, %zmm16
3737 ; AVX512DQ-NEXT: vpermi2d %zmm10, %zmm2, %zmm12
3738 ; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm14, %zmm12
3739 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm14 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0]
3740 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm17
3741 ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm14, %zmm17
3742 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30]
3743 ; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm18, %zmm17
3744 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm19
3745 ; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm14, %zmm19
3746 ; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm18, %zmm19
3747 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm20
3748 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm14, %zmm20
3749 ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm18, %zmm20
3750 ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm10, %zmm14
3751 ; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm18, %zmm14
3752 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm18 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0]
3753 ; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm18, %zmm3
3754 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31]
3755 ; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm9, %zmm3
3756 ; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm18, %zmm0
3757 ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm9, %zmm0
3758 ; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm18, %zmm1
3759 ; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm9, %zmm1
3760 ; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm18, %zmm2
3761 ; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm9, %zmm2
3762 ; AVX512DQ-NEXT: vmovdqa64 %zmm16, 192(%rsi)
3763 ; AVX512DQ-NEXT: vmovdqa64 %zmm15, 128(%rsi)
3764 ; AVX512DQ-NEXT: vmovdqa64 %zmm13, 64(%rsi)
3765 ; AVX512DQ-NEXT: vmovdqa64 %zmm12, (%rsi)
3766 ; AVX512DQ-NEXT: vmovdqa64 %zmm20, 192(%rdx)
3767 ; AVX512DQ-NEXT: vmovdqa64 %zmm14, (%rdx)
3768 ; AVX512DQ-NEXT: vmovdqa64 %zmm19, 64(%rdx)
3769 ; AVX512DQ-NEXT: vmovdqa64 %zmm17, 128(%rdx)
3770 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, 128(%rcx)
3771 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 192(%rcx)
3772 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, (%rcx)
3773 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, 64(%rcx)
3774 ; AVX512DQ-NEXT: vzeroupper
3775 ; AVX512DQ-NEXT: retq
3777 ; AVX512DQ-FCP-LABEL: load_i32_stride3_vf64:
3778 ; AVX512DQ-FCP: # %bb.0:
3779 ; AVX512DQ-FCP-NEXT: vmovdqa64 704(%rdi), %zmm4
3780 ; AVX512DQ-FCP-NEXT: vmovdqa64 640(%rdi), %zmm5
3781 ; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm0
3782 ; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm6
3783 ; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm7
3784 ; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm1
3785 ; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm8
3786 ; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm9
3787 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm2
3788 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm10
3789 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm11
3790 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3
3791 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0]
3792 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm13
3793 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm12, %zmm13
3794 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29]
3795 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm8, %zmm14, %zmm13
3796 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm15
3797 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm12, %zmm15
3798 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm14, %zmm15
3799 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm16
3800 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm12, %zmm16
3801 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm14, %zmm16
3802 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm10, %zmm2, %zmm12
3803 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm14, %zmm12
3804 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0]
3805 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm17
3806 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm14, %zmm17
3807 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30]
3808 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm18, %zmm17
3809 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm19
3810 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm14, %zmm19
3811 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm8, %zmm18, %zmm19
3812 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm20
3813 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm14, %zmm20
3814 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm18, %zmm20
3815 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm10, %zmm14
3816 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm18, %zmm14
3817 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0]
3818 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm18, %zmm3
3819 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31]
3820 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm8, %zmm9, %zmm3
3821 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm18, %zmm0
3822 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm9, %zmm0
3823 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm18, %zmm1
3824 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm9, %zmm1
3825 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm18, %zmm2
3826 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm9, %zmm2
3827 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, 192(%rsi)
3828 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, 128(%rsi)
3829 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 64(%rsi)
3830 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, (%rsi)
3831 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, 192(%rdx)
3832 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, (%rdx)
3833 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, 64(%rdx)
3834 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, 128(%rdx)
3835 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 128(%rcx)
3836 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 192(%rcx)
3837 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%rcx)
3838 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 64(%rcx)
3839 ; AVX512DQ-FCP-NEXT: vzeroupper
3840 ; AVX512DQ-FCP-NEXT: retq
3842 ; AVX512BW-LABEL: load_i32_stride3_vf64:
3843 ; AVX512BW: # %bb.0:
3844 ; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm4
3845 ; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm5
3846 ; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm0
3847 ; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm6
3848 ; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm7
3849 ; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm1
3850 ; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm8
3851 ; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm9
3852 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2
3853 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm10
3854 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm11
3855 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm3
3856 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0]
3857 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm13
3858 ; AVX512BW-NEXT: vpermt2d %zmm9, %zmm12, %zmm13
3859 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29]
3860 ; AVX512BW-NEXT: vpermt2d %zmm8, %zmm14, %zmm13
3861 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm15
3862 ; AVX512BW-NEXT: vpermt2d %zmm7, %zmm12, %zmm15
3863 ; AVX512BW-NEXT: vpermt2d %zmm6, %zmm14, %zmm15
3864 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16
3865 ; AVX512BW-NEXT: vpermt2d %zmm5, %zmm12, %zmm16
3866 ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm14, %zmm16
3867 ; AVX512BW-NEXT: vpermi2d %zmm10, %zmm2, %zmm12
3868 ; AVX512BW-NEXT: vpermt2d %zmm11, %zmm14, %zmm12
3869 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm14 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0]
3870 ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm17
3871 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm14, %zmm17
3872 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30]
3873 ; AVX512BW-NEXT: vpermt2d %zmm6, %zmm18, %zmm17
3874 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm19
3875 ; AVX512BW-NEXT: vpermt2d %zmm3, %zmm14, %zmm19
3876 ; AVX512BW-NEXT: vpermt2d %zmm8, %zmm18, %zmm19
3877 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm20
3878 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm14, %zmm20
3879 ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm18, %zmm20
3880 ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm10, %zmm14
3881 ; AVX512BW-NEXT: vpermt2d %zmm11, %zmm18, %zmm14
3882 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm18 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0]
3883 ; AVX512BW-NEXT: vpermt2d %zmm9, %zmm18, %zmm3
3884 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31]
3885 ; AVX512BW-NEXT: vpermt2d %zmm8, %zmm9, %zmm3
3886 ; AVX512BW-NEXT: vpermt2d %zmm5, %zmm18, %zmm0
3887 ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm9, %zmm0
3888 ; AVX512BW-NEXT: vpermt2d %zmm7, %zmm18, %zmm1
3889 ; AVX512BW-NEXT: vpermt2d %zmm6, %zmm9, %zmm1
3890 ; AVX512BW-NEXT: vpermt2d %zmm10, %zmm18, %zmm2
3891 ; AVX512BW-NEXT: vpermt2d %zmm11, %zmm9, %zmm2
3892 ; AVX512BW-NEXT: vmovdqa64 %zmm16, 192(%rsi)
3893 ; AVX512BW-NEXT: vmovdqa64 %zmm15, 128(%rsi)
3894 ; AVX512BW-NEXT: vmovdqa64 %zmm13, 64(%rsi)
3895 ; AVX512BW-NEXT: vmovdqa64 %zmm12, (%rsi)
3896 ; AVX512BW-NEXT: vmovdqa64 %zmm20, 192(%rdx)
3897 ; AVX512BW-NEXT: vmovdqa64 %zmm14, (%rdx)
3898 ; AVX512BW-NEXT: vmovdqa64 %zmm19, 64(%rdx)
3899 ; AVX512BW-NEXT: vmovdqa64 %zmm17, 128(%rdx)
3900 ; AVX512BW-NEXT: vmovdqa64 %zmm1, 128(%rcx)
3901 ; AVX512BW-NEXT: vmovdqa64 %zmm0, 192(%rcx)
3902 ; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rcx)
3903 ; AVX512BW-NEXT: vmovdqa64 %zmm3, 64(%rcx)
3904 ; AVX512BW-NEXT: vzeroupper
3905 ; AVX512BW-NEXT: retq
3907 ; AVX512BW-FCP-LABEL: load_i32_stride3_vf64:
3908 ; AVX512BW-FCP: # %bb.0:
3909 ; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm4
3910 ; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm5
3911 ; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm0
3912 ; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm6
3913 ; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm7
3914 ; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm1
3915 ; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm8
3916 ; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm9
3917 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2
3918 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm10
3919 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm11
3920 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3
3921 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0]
3922 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm13
3923 ; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm12, %zmm13
3924 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29]
3925 ; AVX512BW-FCP-NEXT: vpermt2d %zmm8, %zmm14, %zmm13
3926 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm15
3927 ; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm12, %zmm15
3928 ; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm14, %zmm15
3929 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16
3930 ; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm12, %zmm16
3931 ; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm14, %zmm16
3932 ; AVX512BW-FCP-NEXT: vpermi2d %zmm10, %zmm2, %zmm12
3933 ; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm14, %zmm12
3934 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0]
3935 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm17
3936 ; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm14, %zmm17
3937 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30]
3938 ; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm18, %zmm17
3939 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm19
3940 ; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm14, %zmm19
3941 ; AVX512BW-FCP-NEXT: vpermt2d %zmm8, %zmm18, %zmm19
3942 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm20
3943 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm14, %zmm20
3944 ; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm18, %zmm20
3945 ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm10, %zmm14
3946 ; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm18, %zmm14
3947 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0]
3948 ; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm18, %zmm3
3949 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31]
3950 ; AVX512BW-FCP-NEXT: vpermt2d %zmm8, %zmm9, %zmm3
3951 ; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm18, %zmm0
3952 ; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm9, %zmm0
3953 ; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm18, %zmm1
3954 ; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm9, %zmm1
3955 ; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm18, %zmm2
3956 ; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm9, %zmm2
3957 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, 192(%rsi)
3958 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, 128(%rsi)
3959 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, 64(%rsi)
3960 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, (%rsi)
3961 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, 192(%rdx)
3962 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, (%rdx)
3963 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, 64(%rdx)
3964 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, 128(%rdx)
3965 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 128(%rcx)
3966 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 192(%rcx)
3967 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%rcx)
3968 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 64(%rcx)
3969 ; AVX512BW-FCP-NEXT: vzeroupper
3970 ; AVX512BW-FCP-NEXT: retq
3972 ; AVX512DQ-BW-LABEL: load_i32_stride3_vf64:
3973 ; AVX512DQ-BW: # %bb.0:
3974 ; AVX512DQ-BW-NEXT: vmovdqa64 704(%rdi), %zmm4
3975 ; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %zmm5
3976 ; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %zmm0
3977 ; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %zmm6
3978 ; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm7
3979 ; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm1
3980 ; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm8
3981 ; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm9
3982 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm2
3983 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm10
3984 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm11
3985 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm3
3986 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0]
3987 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm13
3988 ; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm12, %zmm13
3989 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29]
3990 ; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm14, %zmm13
3991 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm15
3992 ; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm12, %zmm15
3993 ; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm14, %zmm15
3994 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm16
3995 ; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm12, %zmm16
3996 ; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm14, %zmm16
3997 ; AVX512DQ-BW-NEXT: vpermi2d %zmm10, %zmm2, %zmm12
3998 ; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm14, %zmm12
3999 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm14 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0]
4000 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm17
4001 ; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm14, %zmm17
4002 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30]
4003 ; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm18, %zmm17
4004 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm19
4005 ; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm14, %zmm19
4006 ; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm18, %zmm19
4007 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm20
4008 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm14, %zmm20
4009 ; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm18, %zmm20
4010 ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm10, %zmm14
4011 ; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm18, %zmm14
4012 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm18 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0]
4013 ; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm18, %zmm3
4014 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31]
4015 ; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm9, %zmm3
4016 ; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm18, %zmm0
4017 ; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm9, %zmm0
4018 ; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm18, %zmm1
4019 ; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm9, %zmm1
4020 ; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm18, %zmm2
4021 ; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm9, %zmm2
4022 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, 192(%rsi)
4023 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, 128(%rsi)
4024 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, 64(%rsi)
4025 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, (%rsi)
4026 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, 192(%rdx)
4027 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, (%rdx)
4028 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, 64(%rdx)
4029 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, 128(%rdx)
4030 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 128(%rcx)
4031 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 192(%rcx)
4032 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%rcx)
4033 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 64(%rcx)
4034 ; AVX512DQ-BW-NEXT: vzeroupper
4035 ; AVX512DQ-BW-NEXT: retq
4037 ; AVX512DQ-BW-FCP-LABEL: load_i32_stride3_vf64:
4038 ; AVX512DQ-BW-FCP: # %bb.0:
4039 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm4
4040 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm5
4041 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm0
4042 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm6
4043 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm7
4044 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm1
4045 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm8
4046 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm9
4047 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2
4048 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm10
4049 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm11
4050 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3
4051 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0]
4052 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm13
4053 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm12, %zmm13
4054 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29]
4055 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm8, %zmm14, %zmm13
4056 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm15
4057 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm12, %zmm15
4058 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm14, %zmm15
4059 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16
4060 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm12, %zmm16
4061 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm14, %zmm16
4062 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm10, %zmm2, %zmm12
4063 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm14, %zmm12
4064 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0]
4065 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm17
4066 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm14, %zmm17
4067 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30]
4068 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm18, %zmm17
4069 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm19
4070 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm14, %zmm19
4071 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm8, %zmm18, %zmm19
4072 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm20
4073 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm14, %zmm20
4074 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm18, %zmm20
4075 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm10, %zmm14
4076 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm18, %zmm14
4077 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0]
4078 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm18, %zmm3
4079 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31]
4080 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm8, %zmm9, %zmm3
4081 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm18, %zmm0
4082 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm9, %zmm0
4083 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm18, %zmm1
4084 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm9, %zmm1
4085 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm18, %zmm2
4086 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm9, %zmm2
4087 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, 192(%rsi)
4088 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, 128(%rsi)
4089 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, 64(%rsi)
4090 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, (%rsi)
4091 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, 192(%rdx)
4092 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, (%rdx)
4093 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, 64(%rdx)
4094 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, 128(%rdx)
4095 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 128(%rcx)
4096 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 192(%rcx)
4097 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%rcx)
4098 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 64(%rcx)
4099 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
4100 ; AVX512DQ-BW-FCP-NEXT: retq
4101 %wide.vec = load <192 x i32>, ptr %in.vec, align 64
4102 %strided.vec0 = shufflevector <192 x i32> %wide.vec, <192 x i32> poison, <64 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45, i32 48, i32 51, i32 54, i32 57, i32 60, i32 63, i32 66, i32 69, i32 72, i32 75, i32 78, i32 81, i32 84, i32 87, i32 90, i32 93, i32 96, i32 99, i32 102, i32 105, i32 108, i32 111, i32 114, i32 117, i32 120, i32 123, i32 126, i32 129, i32 132, i32 135, i32 138, i32 141, i32 144, i32 147, i32 150, i32 153, i32 156, i32 159, i32 162, i32 165, i32 168, i32 171, i32 174, i32 177, i32 180, i32 183, i32 186, i32 189>
4103 %strided.vec1 = shufflevector <192 x i32> %wide.vec, <192 x i32> poison, <64 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46, i32 49, i32 52, i32 55, i32 58, i32 61, i32 64, i32 67, i32 70, i32 73, i32 76, i32 79, i32 82, i32 85, i32 88, i32 91, i32 94, i32 97, i32 100, i32 103, i32 106, i32 109, i32 112, i32 115, i32 118, i32 121, i32 124, i32 127, i32 130, i32 133, i32 136, i32 139, i32 142, i32 145, i32 148, i32 151, i32 154, i32 157, i32 160, i32 163, i32 166, i32 169, i32 172, i32 175, i32 178, i32 181, i32 184, i32 187, i32 190>
4104 %strided.vec2 = shufflevector <192 x i32> %wide.vec, <192 x i32> poison, <64 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47, i32 50, i32 53, i32 56, i32 59, i32 62, i32 65, i32 68, i32 71, i32 74, i32 77, i32 80, i32 83, i32 86, i32 89, i32 92, i32 95, i32 98, i32 101, i32 104, i32 107, i32 110, i32 113, i32 116, i32 119, i32 122, i32 125, i32 128, i32 131, i32 134, i32 137, i32 140, i32 143, i32 146, i32 149, i32 152, i32 155, i32 158, i32 161, i32 164, i32 167, i32 170, i32 173, i32 176, i32 179, i32 182, i32 185, i32 188, i32 191>
4105 store <64 x i32> %strided.vec0, ptr %out.vec0, align 64
4106 store <64 x i32> %strided.vec1, ptr %out.vec1, align 64
4107 store <64 x i32> %strided.vec2, ptr %out.vec2, align 64