; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512DQ
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512DQ-BW
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-BW-FCP

; These patterns are produced by LoopVectorizer for interleaved loads.
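;
; As a rough illustration only (hypothetical source, not part of the generated
; checks): a stride-4 gather loop of the following shape is the kind of input
; the LoopVectorizer rewrites into one wide vector load plus the four
; shufflevector de-interleaves that each function below hands to llc:
;
;   for (int i = 0; i != n; ++i) {
;     out0[i] = in[4*i + 0];
;     out1[i] = in[4*i + 1];
;     out2[i] = in[4*i + 2];
;     out3[i] = in[4*i + 3];
;   }
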
define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3) nounwind {
19 ; SSE-LABEL: load_i32_stride4_vf2:
21 ; SSE-NEXT: movdqa (%rdi), %xmm0
22 ; SSE-NEXT: movdqa 16(%rdi), %xmm1
23 ; SSE-NEXT: movdqa %xmm0, %xmm2
24 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
25 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
26 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
27 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
28 ; SSE-NEXT: movq %xmm2, (%rsi)
29 ; SSE-NEXT: movq %xmm3, (%rdx)
30 ; SSE-NEXT: movq %xmm0, (%rcx)
31 ; SSE-NEXT: movq %xmm1, (%r8)
34 ; AVX-LABEL: load_i32_stride4_vf2:
36 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
37 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
38 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
39 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
40 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3],xmm3[4,5,6,7]
41 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
42 ; AVX-NEXT: vmovq %xmm2, (%rsi)
43 ; AVX-NEXT: vmovq %xmm3, (%rdx)
44 ; AVX-NEXT: vmovq %xmm0, (%rcx)
45 ; AVX-NEXT: vpextrq $1, %xmm0, (%r8)
48 ; AVX2-LABEL: load_i32_stride4_vf2:
50 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
51 ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
52 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
53 ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
54 ; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3]
55 ; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
56 ; AVX2-NEXT: vmovq %xmm2, (%rsi)
57 ; AVX2-NEXT: vmovq %xmm3, (%rdx)
58 ; AVX2-NEXT: vmovq %xmm0, (%rcx)
59 ; AVX2-NEXT: vpextrq $1, %xmm0, (%r8)
62 ; AVX2-FP-LABEL: load_i32_stride4_vf2:
64 ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0
65 ; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm1
66 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
67 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
68 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3]
69 ; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
70 ; AVX2-FP-NEXT: vmovq %xmm2, (%rsi)
71 ; AVX2-FP-NEXT: vmovq %xmm3, (%rdx)
72 ; AVX2-FP-NEXT: vmovq %xmm0, (%rcx)
73 ; AVX2-FP-NEXT: vpextrq $1, %xmm0, (%r8)
76 ; AVX2-FCP-LABEL: load_i32_stride4_vf2:
78 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0
79 ; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
80 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
81 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
82 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3]
83 ; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
84 ; AVX2-FCP-NEXT: vmovq %xmm2, (%rsi)
85 ; AVX2-FCP-NEXT: vmovq %xmm3, (%rdx)
86 ; AVX2-FCP-NEXT: vmovq %xmm0, (%rcx)
87 ; AVX2-FCP-NEXT: vpextrq $1, %xmm0, (%r8)
90 ; AVX512-LABEL: load_i32_stride4_vf2:
92 ; AVX512-NEXT: vmovdqa (%rdi), %xmm0
93 ; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
94 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
95 ; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
96 ; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3]
97 ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
98 ; AVX512-NEXT: vmovq %xmm2, (%rsi)
99 ; AVX512-NEXT: vmovq %xmm3, (%rdx)
100 ; AVX512-NEXT: vmovq %xmm0, (%rcx)
101 ; AVX512-NEXT: vpextrq $1, %xmm0, (%r8)
104 ; AVX512-FCP-LABEL: load_i32_stride4_vf2:
105 ; AVX512-FCP: # %bb.0:
106 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
107 ; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
108 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
109 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,1,1]
110 ; AVX512-FCP-NEXT: vpermps (%rdi), %ymm3, %ymm3
111 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
112 ; AVX512-FCP-NEXT: vmovq %xmm2, (%rsi)
113 ; AVX512-FCP-NEXT: vmovlps %xmm3, (%rdx)
114 ; AVX512-FCP-NEXT: vmovq %xmm0, (%rcx)
115 ; AVX512-FCP-NEXT: vpextrq $1, %xmm0, (%r8)
116 ; AVX512-FCP-NEXT: vzeroupper
117 ; AVX512-FCP-NEXT: retq
119 ; AVX512DQ-LABEL: load_i32_stride4_vf2:
121 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
122 ; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm1
123 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
124 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
125 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3]
126 ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
127 ; AVX512DQ-NEXT: vmovq %xmm2, (%rsi)
128 ; AVX512DQ-NEXT: vmovq %xmm3, (%rdx)
129 ; AVX512DQ-NEXT: vmovq %xmm0, (%rcx)
130 ; AVX512DQ-NEXT: vpextrq $1, %xmm0, (%r8)
131 ; AVX512DQ-NEXT: retq
133 ; AVX512DQ-FCP-LABEL: load_i32_stride4_vf2:
134 ; AVX512DQ-FCP: # %bb.0:
135 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
136 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
137 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
138 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,1,1]
139 ; AVX512DQ-FCP-NEXT: vpermps (%rdi), %ymm3, %ymm3
140 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
141 ; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rsi)
142 ; AVX512DQ-FCP-NEXT: vmovlps %xmm3, (%rdx)
143 ; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rcx)
144 ; AVX512DQ-FCP-NEXT: vpextrq $1, %xmm0, (%r8)
145 ; AVX512DQ-FCP-NEXT: vzeroupper
146 ; AVX512DQ-FCP-NEXT: retq
148 ; AVX512BW-LABEL: load_i32_stride4_vf2:
150 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
151 ; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
152 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
153 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
154 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3]
155 ; AVX512BW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
156 ; AVX512BW-NEXT: vmovq %xmm2, (%rsi)
157 ; AVX512BW-NEXT: vmovq %xmm3, (%rdx)
158 ; AVX512BW-NEXT: vmovq %xmm0, (%rcx)
159 ; AVX512BW-NEXT: vpextrq $1, %xmm0, (%r8)
160 ; AVX512BW-NEXT: retq
162 ; AVX512BW-FCP-LABEL: load_i32_stride4_vf2:
163 ; AVX512BW-FCP: # %bb.0:
164 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
165 ; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
166 ; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
167 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,1,1]
168 ; AVX512BW-FCP-NEXT: vpermps (%rdi), %ymm3, %ymm3
169 ; AVX512BW-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
170 ; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rsi)
171 ; AVX512BW-FCP-NEXT: vmovlps %xmm3, (%rdx)
172 ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rcx)
173 ; AVX512BW-FCP-NEXT: vpextrq $1, %xmm0, (%r8)
174 ; AVX512BW-FCP-NEXT: vzeroupper
175 ; AVX512BW-FCP-NEXT: retq
177 ; AVX512DQ-BW-LABEL: load_i32_stride4_vf2:
178 ; AVX512DQ-BW: # %bb.0:
179 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0
180 ; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm1
181 ; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
182 ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
183 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3]
184 ; AVX512DQ-BW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
185 ; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rsi)
186 ; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rdx)
187 ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rcx)
188 ; AVX512DQ-BW-NEXT: vpextrq $1, %xmm0, (%r8)
189 ; AVX512DQ-BW-NEXT: retq
191 ; AVX512DQ-BW-FCP-LABEL: load_i32_stride4_vf2:
192 ; AVX512DQ-BW-FCP: # %bb.0:
193 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
194 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
195 ; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
196 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,1,1]
197 ; AVX512DQ-BW-FCP-NEXT: vpermps (%rdi), %ymm3, %ymm3
198 ; AVX512DQ-BW-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
199 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rsi)
200 ; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm3, (%rdx)
201 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rcx)
202 ; AVX512DQ-BW-FCP-NEXT: vpextrq $1, %xmm0, (%r8)
203 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
204 ; AVX512DQ-BW-FCP-NEXT: retq
  %wide.vec = load <8 x i32>, ptr %in.vec, align 64
  %strided.vec0 = shufflevector <8 x i32> %wide.vec, <8 x i32> poison, <2 x i32> <i32 0, i32 4>
  %strided.vec1 = shufflevector <8 x i32> %wide.vec, <8 x i32> poison, <2 x i32> <i32 1, i32 5>
  %strided.vec2 = shufflevector <8 x i32> %wide.vec, <8 x i32> poison, <2 x i32> <i32 2, i32 6>
  %strided.vec3 = shufflevector <8 x i32> %wide.vec, <8 x i32> poison, <2 x i32> <i32 3, i32 7>
  store <2 x i32> %strided.vec0, ptr %out.vec0, align 64
  store <2 x i32> %strided.vec1, ptr %out.vec1, align 64
  store <2 x i32> %strided.vec2, ptr %out.vec2, align 64
  store <2 x i32> %strided.vec3, ptr %out.vec3, align 64
  ret void
}

define void @load_i32_stride4_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3) nounwind {
218 ; SSE-LABEL: load_i32_stride4_vf4:
220 ; SSE-NEXT: movaps (%rdi), %xmm0
221 ; SSE-NEXT: movaps 16(%rdi), %xmm1
222 ; SSE-NEXT: movaps 32(%rdi), %xmm2
223 ; SSE-NEXT: movaps 48(%rdi), %xmm3
224 ; SSE-NEXT: movaps %xmm2, %xmm4
225 ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
226 ; SSE-NEXT: movaps %xmm0, %xmm5
227 ; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
228 ; SSE-NEXT: movaps %xmm5, %xmm6
229 ; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm4[0]
230 ; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm4[1]
231 ; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
232 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
233 ; SSE-NEXT: movaps %xmm0, %xmm1
234 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
235 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
236 ; SSE-NEXT: movaps %xmm6, (%rsi)
237 ; SSE-NEXT: movaps %xmm5, (%rdx)
238 ; SSE-NEXT: movaps %xmm1, (%rcx)
239 ; SSE-NEXT: movaps %xmm0, (%r8)
242 ; AVX-LABEL: load_i32_stride4_vf4:
244 ; AVX-NEXT: vmovaps (%rdi), %xmm0
245 ; AVX-NEXT: vmovaps 16(%rdi), %xmm1
246 ; AVX-NEXT: vmovaps 32(%rdi), %xmm2
247 ; AVX-NEXT: vmovaps 48(%rdi), %xmm3
248 ; AVX-NEXT: vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm2[0]
249 ; AVX-NEXT: vunpcklps {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
250 ; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,0]
251 ; AVX-NEXT: vinsertps {{.*#+}} xmm5 = xmm0[1],xmm1[1],zero,zero
252 ; AVX-NEXT: vunpcklps {{.*#+}} xmm6 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
253 ; AVX-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3]
254 ; AVX-NEXT: vunpckhps {{.*#+}} xmm6 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
255 ; AVX-NEXT: vinsertps {{.*#+}} xmm7 = zero,zero,xmm2[2],xmm3[2]
256 ; AVX-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3]
257 ; AVX-NEXT: vunpckhps {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
258 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[3,0],xmm0[3,0]
259 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,3]
260 ; AVX-NEXT: vmovaps %xmm4, (%rsi)
261 ; AVX-NEXT: vmovaps %xmm5, (%rdx)
262 ; AVX-NEXT: vmovaps %xmm6, (%rcx)
263 ; AVX-NEXT: vmovaps %xmm0, (%r8)
266 ; AVX2-LABEL: load_i32_stride4_vf4:
268 ; AVX2-NEXT: vmovddup {{.*#+}} xmm0 = [0,4,0,4]
269 ; AVX2-NEXT: # xmm0 = mem[0,0]
270 ; AVX2-NEXT: vmovaps 32(%rdi), %ymm1
271 ; AVX2-NEXT: vpermps %ymm1, %ymm0, %ymm0
272 ; AVX2-NEXT: vmovaps (%rdi), %xmm2
273 ; AVX2-NEXT: vmovaps 16(%rdi), %xmm3
274 ; AVX2-NEXT: vmovaps 32(%rdi), %xmm4
275 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm5 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
276 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm5[0,1],xmm0[2,3]
277 ; AVX2-NEXT: vmovaps 48(%rdi), %xmm5
278 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm6 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
279 ; AVX2-NEXT: vmovsd {{.*#+}} xmm7 = [1,5,0,0]
280 ; AVX2-NEXT: vmovaps (%rdi), %ymm8
281 ; AVX2-NEXT: vpermps %ymm8, %ymm7, %ymm7
282 ; AVX2-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3]
283 ; AVX2-NEXT: vmovddup {{.*#+}} xmm7 = [2,6,2,6]
284 ; AVX2-NEXT: # xmm7 = mem[0,0]
285 ; AVX2-NEXT: vpermps %ymm1, %ymm7, %ymm1
286 ; AVX2-NEXT: vunpckhps {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
287 ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
288 ; AVX2-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm5[2],xmm4[3],xmm5[3]
289 ; AVX2-NEXT: vmovsd {{.*#+}} xmm3 = [3,7,0,0]
290 ; AVX2-NEXT: vpermps %ymm8, %ymm3, %ymm3
291 ; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
292 ; AVX2-NEXT: vmovaps %xmm0, (%rsi)
293 ; AVX2-NEXT: vmovaps %xmm6, (%rdx)
294 ; AVX2-NEXT: vmovaps %xmm1, (%rcx)
295 ; AVX2-NEXT: vmovaps %xmm2, (%r8)
296 ; AVX2-NEXT: vzeroupper
299 ; AVX2-FP-LABEL: load_i32_stride4_vf4:
301 ; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm0 = [0,4,0,4]
302 ; AVX2-FP-NEXT: # xmm0 = mem[0,0]
303 ; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm1
304 ; AVX2-FP-NEXT: vpermps %ymm1, %ymm0, %ymm0
305 ; AVX2-FP-NEXT: vmovaps (%rdi), %xmm2
306 ; AVX2-FP-NEXT: vmovaps 16(%rdi), %xmm3
307 ; AVX2-FP-NEXT: vmovaps 32(%rdi), %xmm4
308 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm5 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
309 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm5[0,1],xmm0[2,3]
310 ; AVX2-FP-NEXT: vmovaps 48(%rdi), %xmm5
311 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm6 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
312 ; AVX2-FP-NEXT: vmovsd {{.*#+}} xmm7 = [1,5,0,0]
313 ; AVX2-FP-NEXT: vmovaps (%rdi), %ymm8
314 ; AVX2-FP-NEXT: vpermps %ymm8, %ymm7, %ymm7
315 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3]
316 ; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm7 = [2,6,2,6]
317 ; AVX2-FP-NEXT: # xmm7 = mem[0,0]
318 ; AVX2-FP-NEXT: vpermps %ymm1, %ymm7, %ymm1
319 ; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
320 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
321 ; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm5[2],xmm4[3],xmm5[3]
322 ; AVX2-FP-NEXT: vmovsd {{.*#+}} xmm3 = [3,7,0,0]
323 ; AVX2-FP-NEXT: vpermps %ymm8, %ymm3, %ymm3
324 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
325 ; AVX2-FP-NEXT: vmovaps %xmm0, (%rsi)
326 ; AVX2-FP-NEXT: vmovaps %xmm6, (%rdx)
327 ; AVX2-FP-NEXT: vmovaps %xmm1, (%rcx)
328 ; AVX2-FP-NEXT: vmovaps %xmm2, (%r8)
329 ; AVX2-FP-NEXT: vzeroupper
332 ; AVX2-FCP-LABEL: load_i32_stride4_vf4:
334 ; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm0 = [0,4,0,4]
335 ; AVX2-FCP-NEXT: # xmm0 = mem[0,0]
336 ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm1
337 ; AVX2-FCP-NEXT: vpermps %ymm1, %ymm0, %ymm0
338 ; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm2
339 ; AVX2-FCP-NEXT: vmovaps 16(%rdi), %xmm3
340 ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %xmm4
341 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm5 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
342 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm5[0,1],xmm0[2,3]
343 ; AVX2-FCP-NEXT: vmovaps 48(%rdi), %xmm5
344 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm6 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
345 ; AVX2-FCP-NEXT: vmovsd {{.*#+}} xmm7 = [1,5,0,0]
346 ; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm8
347 ; AVX2-FCP-NEXT: vpermps %ymm8, %ymm7, %ymm7
348 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3]
349 ; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm7 = [2,6,2,6]
350 ; AVX2-FCP-NEXT: # xmm7 = mem[0,0]
351 ; AVX2-FCP-NEXT: vpermps %ymm1, %ymm7, %ymm1
352 ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
353 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
354 ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm5[2],xmm4[3],xmm5[3]
355 ; AVX2-FCP-NEXT: vmovsd {{.*#+}} xmm3 = [3,7,0,0]
356 ; AVX2-FCP-NEXT: vpermps %ymm8, %ymm3, %ymm3
357 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
358 ; AVX2-FCP-NEXT: vmovaps %xmm0, (%rsi)
359 ; AVX2-FCP-NEXT: vmovaps %xmm6, (%rdx)
360 ; AVX2-FCP-NEXT: vmovaps %xmm1, (%rcx)
361 ; AVX2-FCP-NEXT: vmovaps %xmm2, (%r8)
362 ; AVX2-FCP-NEXT: vzeroupper
363 ; AVX2-FCP-NEXT: retq
365 ; AVX512-LABEL: load_i32_stride4_vf4:
367 ; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,4,8,12]
368 ; AVX512-NEXT: vmovaps (%rdi), %zmm1
369 ; AVX512-NEXT: vpermps %zmm1, %zmm0, %zmm0
370 ; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,5,9,13]
371 ; AVX512-NEXT: vpermps %zmm1, %zmm2, %zmm2
372 ; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,6,10,14]
373 ; AVX512-NEXT: vpermps %zmm1, %zmm3, %zmm3
374 ; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm4 = [3,7,11,15]
375 ; AVX512-NEXT: vpermps %zmm1, %zmm4, %zmm1
376 ; AVX512-NEXT: vmovaps %xmm0, (%rsi)
377 ; AVX512-NEXT: vmovaps %xmm2, (%rdx)
378 ; AVX512-NEXT: vmovaps %xmm3, (%rcx)
379 ; AVX512-NEXT: vmovaps %xmm1, (%r8)
380 ; AVX512-NEXT: vzeroupper
383 ; AVX512-FCP-LABEL: load_i32_stride4_vf4:
384 ; AVX512-FCP: # %bb.0:
385 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,4,8,12]
386 ; AVX512-FCP-NEXT: vmovaps (%rdi), %zmm1
387 ; AVX512-FCP-NEXT: vpermps %zmm1, %zmm0, %zmm0
388 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,5,9,13]
389 ; AVX512-FCP-NEXT: vpermps %zmm1, %zmm2, %zmm2
390 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,6,10,14]
391 ; AVX512-FCP-NEXT: vpermps %zmm1, %zmm3, %zmm3
392 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [3,7,11,15]
393 ; AVX512-FCP-NEXT: vpermps %zmm1, %zmm4, %zmm1
394 ; AVX512-FCP-NEXT: vmovaps %xmm0, (%rsi)
395 ; AVX512-FCP-NEXT: vmovaps %xmm2, (%rdx)
396 ; AVX512-FCP-NEXT: vmovaps %xmm3, (%rcx)
397 ; AVX512-FCP-NEXT: vmovaps %xmm1, (%r8)
398 ; AVX512-FCP-NEXT: vzeroupper
399 ; AVX512-FCP-NEXT: retq
401 ; AVX512DQ-LABEL: load_i32_stride4_vf4:
403 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,4,8,12]
404 ; AVX512DQ-NEXT: vmovaps (%rdi), %zmm1
405 ; AVX512DQ-NEXT: vpermps %zmm1, %zmm0, %zmm0
406 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,5,9,13]
407 ; AVX512DQ-NEXT: vpermps %zmm1, %zmm2, %zmm2
408 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,6,10,14]
409 ; AVX512DQ-NEXT: vpermps %zmm1, %zmm3, %zmm3
410 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm4 = [3,7,11,15]
411 ; AVX512DQ-NEXT: vpermps %zmm1, %zmm4, %zmm1
412 ; AVX512DQ-NEXT: vmovaps %xmm0, (%rsi)
413 ; AVX512DQ-NEXT: vmovaps %xmm2, (%rdx)
414 ; AVX512DQ-NEXT: vmovaps %xmm3, (%rcx)
415 ; AVX512DQ-NEXT: vmovaps %xmm1, (%r8)
416 ; AVX512DQ-NEXT: vzeroupper
417 ; AVX512DQ-NEXT: retq
419 ; AVX512DQ-FCP-LABEL: load_i32_stride4_vf4:
420 ; AVX512DQ-FCP: # %bb.0:
421 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,4,8,12]
422 ; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %zmm1
423 ; AVX512DQ-FCP-NEXT: vpermps %zmm1, %zmm0, %zmm0
424 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,5,9,13]
425 ; AVX512DQ-FCP-NEXT: vpermps %zmm1, %zmm2, %zmm2
426 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,6,10,14]
427 ; AVX512DQ-FCP-NEXT: vpermps %zmm1, %zmm3, %zmm3
428 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [3,7,11,15]
429 ; AVX512DQ-FCP-NEXT: vpermps %zmm1, %zmm4, %zmm1
430 ; AVX512DQ-FCP-NEXT: vmovaps %xmm0, (%rsi)
431 ; AVX512DQ-FCP-NEXT: vmovaps %xmm2, (%rdx)
432 ; AVX512DQ-FCP-NEXT: vmovaps %xmm3, (%rcx)
433 ; AVX512DQ-FCP-NEXT: vmovaps %xmm1, (%r8)
434 ; AVX512DQ-FCP-NEXT: vzeroupper
435 ; AVX512DQ-FCP-NEXT: retq
437 ; AVX512BW-LABEL: load_i32_stride4_vf4:
439 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,4,8,12]
440 ; AVX512BW-NEXT: vmovaps (%rdi), %zmm1
441 ; AVX512BW-NEXT: vpermps %zmm1, %zmm0, %zmm0
442 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,5,9,13]
443 ; AVX512BW-NEXT: vpermps %zmm1, %zmm2, %zmm2
444 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,6,10,14]
445 ; AVX512BW-NEXT: vpermps %zmm1, %zmm3, %zmm3
446 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm4 = [3,7,11,15]
447 ; AVX512BW-NEXT: vpermps %zmm1, %zmm4, %zmm1
448 ; AVX512BW-NEXT: vmovaps %xmm0, (%rsi)
449 ; AVX512BW-NEXT: vmovaps %xmm2, (%rdx)
450 ; AVX512BW-NEXT: vmovaps %xmm3, (%rcx)
451 ; AVX512BW-NEXT: vmovaps %xmm1, (%r8)
452 ; AVX512BW-NEXT: vzeroupper
453 ; AVX512BW-NEXT: retq
455 ; AVX512BW-FCP-LABEL: load_i32_stride4_vf4:
456 ; AVX512BW-FCP: # %bb.0:
457 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,4,8,12]
458 ; AVX512BW-FCP-NEXT: vmovaps (%rdi), %zmm1
459 ; AVX512BW-FCP-NEXT: vpermps %zmm1, %zmm0, %zmm0
460 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,5,9,13]
461 ; AVX512BW-FCP-NEXT: vpermps %zmm1, %zmm2, %zmm2
462 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,6,10,14]
463 ; AVX512BW-FCP-NEXT: vpermps %zmm1, %zmm3, %zmm3
464 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [3,7,11,15]
465 ; AVX512BW-FCP-NEXT: vpermps %zmm1, %zmm4, %zmm1
466 ; AVX512BW-FCP-NEXT: vmovaps %xmm0, (%rsi)
467 ; AVX512BW-FCP-NEXT: vmovaps %xmm2, (%rdx)
468 ; AVX512BW-FCP-NEXT: vmovaps %xmm3, (%rcx)
469 ; AVX512BW-FCP-NEXT: vmovaps %xmm1, (%r8)
470 ; AVX512BW-FCP-NEXT: vzeroupper
471 ; AVX512BW-FCP-NEXT: retq
473 ; AVX512DQ-BW-LABEL: load_i32_stride4_vf4:
474 ; AVX512DQ-BW: # %bb.0:
475 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,4,8,12]
476 ; AVX512DQ-BW-NEXT: vmovaps (%rdi), %zmm1
477 ; AVX512DQ-BW-NEXT: vpermps %zmm1, %zmm0, %zmm0
478 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,5,9,13]
479 ; AVX512DQ-BW-NEXT: vpermps %zmm1, %zmm2, %zmm2
480 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,6,10,14]
481 ; AVX512DQ-BW-NEXT: vpermps %zmm1, %zmm3, %zmm3
482 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm4 = [3,7,11,15]
483 ; AVX512DQ-BW-NEXT: vpermps %zmm1, %zmm4, %zmm1
484 ; AVX512DQ-BW-NEXT: vmovaps %xmm0, (%rsi)
485 ; AVX512DQ-BW-NEXT: vmovaps %xmm2, (%rdx)
486 ; AVX512DQ-BW-NEXT: vmovaps %xmm3, (%rcx)
487 ; AVX512DQ-BW-NEXT: vmovaps %xmm1, (%r8)
488 ; AVX512DQ-BW-NEXT: vzeroupper
489 ; AVX512DQ-BW-NEXT: retq
491 ; AVX512DQ-BW-FCP-LABEL: load_i32_stride4_vf4:
492 ; AVX512DQ-BW-FCP: # %bb.0:
493 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,4,8,12]
494 ; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %zmm1
495 ; AVX512DQ-BW-FCP-NEXT: vpermps %zmm1, %zmm0, %zmm0
496 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,5,9,13]
497 ; AVX512DQ-BW-FCP-NEXT: vpermps %zmm1, %zmm2, %zmm2
498 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,6,10,14]
499 ; AVX512DQ-BW-FCP-NEXT: vpermps %zmm1, %zmm3, %zmm3
500 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [3,7,11,15]
501 ; AVX512DQ-BW-FCP-NEXT: vpermps %zmm1, %zmm4, %zmm1
502 ; AVX512DQ-BW-FCP-NEXT: vmovaps %xmm0, (%rsi)
503 ; AVX512DQ-BW-FCP-NEXT: vmovaps %xmm2, (%rdx)
504 ; AVX512DQ-BW-FCP-NEXT: vmovaps %xmm3, (%rcx)
505 ; AVX512DQ-BW-FCP-NEXT: vmovaps %xmm1, (%r8)
506 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
507 ; AVX512DQ-BW-FCP-NEXT: retq
  %wide.vec = load <16 x i32>, ptr %in.vec, align 64
  %strided.vec0 = shufflevector <16 x i32> %wide.vec, <16 x i32> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
  %strided.vec1 = shufflevector <16 x i32> %wide.vec, <16 x i32> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
  %strided.vec2 = shufflevector <16 x i32> %wide.vec, <16 x i32> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
  %strided.vec3 = shufflevector <16 x i32> %wide.vec, <16 x i32> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
  store <4 x i32> %strided.vec0, ptr %out.vec0, align 64
  store <4 x i32> %strided.vec1, ptr %out.vec1, align 64
  store <4 x i32> %strided.vec2, ptr %out.vec2, align 64
  store <4 x i32> %strided.vec3, ptr %out.vec3, align 64
  ret void
}

define void @load_i32_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3) nounwind {
521 ; SSE-LABEL: load_i32_stride4_vf8:
523 ; SSE-NEXT: movaps (%rdi), %xmm0
524 ; SSE-NEXT: movaps 16(%rdi), %xmm3
525 ; SSE-NEXT: movaps 32(%rdi), %xmm2
526 ; SSE-NEXT: movaps 48(%rdi), %xmm4
527 ; SSE-NEXT: movaps 80(%rdi), %xmm5
528 ; SSE-NEXT: movaps 64(%rdi), %xmm1
529 ; SSE-NEXT: movaps 112(%rdi), %xmm6
530 ; SSE-NEXT: movaps 96(%rdi), %xmm7
531 ; SSE-NEXT: movaps %xmm7, %xmm8
532 ; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1]
533 ; SSE-NEXT: movaps %xmm1, %xmm9
534 ; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1]
535 ; SSE-NEXT: movaps %xmm9, %xmm10
536 ; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm8[0]
537 ; SSE-NEXT: movaps %xmm2, %xmm11
538 ; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1]
539 ; SSE-NEXT: movaps %xmm0, %xmm12
540 ; SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0],xmm3[0],xmm12[1],xmm3[1]
541 ; SSE-NEXT: movaps %xmm12, %xmm13
542 ; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm11[0]
543 ; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm8[1]
544 ; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm11[1]
545 ; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm6[2],xmm7[3],xmm6[3]
546 ; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3]
547 ; SSE-NEXT: movaps %xmm1, %xmm5
548 ; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm7[0]
549 ; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
550 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
551 ; SSE-NEXT: movaps %xmm0, %xmm3
552 ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0]
553 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1]
554 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
555 ; SSE-NEXT: movaps %xmm10, 16(%rsi)
556 ; SSE-NEXT: movaps %xmm13, (%rsi)
557 ; SSE-NEXT: movaps %xmm9, 16(%rdx)
558 ; SSE-NEXT: movaps %xmm12, (%rdx)
559 ; SSE-NEXT: movaps %xmm5, 16(%rcx)
560 ; SSE-NEXT: movaps %xmm3, (%rcx)
561 ; SSE-NEXT: movaps %xmm1, 16(%r8)
562 ; SSE-NEXT: movaps %xmm0, (%r8)
565 ; AVX-LABEL: load_i32_stride4_vf8:
567 ; AVX-NEXT: vmovaps 64(%rdi), %ymm0
568 ; AVX-NEXT: vmovaps 96(%rdi), %ymm1
569 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1]
570 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
571 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3,0,1]
572 ; AVX-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[4],ymm0[4],ymm4[5],ymm0[5]
573 ; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,0],ymm5[4,5],ymm3[6,4]
574 ; AVX-NEXT: vmovaps (%rdi), %xmm5
575 ; AVX-NEXT: vmovaps 16(%rdi), %xmm6
576 ; AVX-NEXT: vmovaps 32(%rdi), %xmm7
577 ; AVX-NEXT: vmovaps 48(%rdi), %xmm8
578 ; AVX-NEXT: vmovlhps {{.*#+}} xmm9 = xmm8[0],xmm7[0]
579 ; AVX-NEXT: vunpcklps {{.*#+}} xmm10 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
580 ; AVX-NEXT: vshufps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,0]
581 ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm9[0,1,2,3],ymm3[4,5,6,7]
582 ; AVX-NEXT: vunpcklps {{.*#+}} ymm9 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5]
583 ; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm0[1,0],ymm4[1,0],ymm0[5,4],ymm4[5,4]
584 ; AVX-NEXT: vshufps {{.*#+}} ymm9 = ymm10[2,0],ymm9[2,3],ymm10[6,4],ymm9[6,7]
585 ; AVX-NEXT: vinsertps {{.*#+}} xmm10 = xmm5[1],xmm6[1],zero,zero
586 ; AVX-NEXT: vunpcklps {{.*#+}} xmm11 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
587 ; AVX-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm11[2,3]
588 ; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
589 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm1[1],ymm2[1],ymm1[3],ymm2[3]
590 ; AVX-NEXT: vunpckhps {{.*#+}} ymm11 = ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[6],ymm0[6],ymm4[7],ymm0[7]
591 ; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,0],ymm11[4,5],ymm10[6,4]
592 ; AVX-NEXT: vunpckhps {{.*#+}} xmm11 = xmm5[2],xmm6[2],xmm5[3],xmm6[3]
593 ; AVX-NEXT: vinsertps {{.*#+}} xmm12 = zero,zero,xmm7[2],xmm8[2]
594 ; AVX-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],xmm12[2,3]
595 ; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
596 ; AVX-NEXT: vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[6],ymm1[6],ymm2[7],ymm1[7]
597 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm4[3,0],ymm0[7,4],ymm4[7,4]
598 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,3],ymm0[6,4],ymm1[6,7]
599 ; AVX-NEXT: vunpckhps {{.*#+}} xmm1 = xmm7[2],xmm8[2],xmm7[3],xmm8[3]
600 ; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm6[3,0],xmm5[3,0]
601 ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,0],xmm1[2,3]
602 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
603 ; AVX-NEXT: vmovaps %ymm3, (%rsi)
604 ; AVX-NEXT: vmovaps %ymm9, (%rdx)
605 ; AVX-NEXT: vmovaps %ymm10, (%rcx)
606 ; AVX-NEXT: vmovaps %ymm0, (%r8)
607 ; AVX-NEXT: vzeroupper
610 ; AVX2-LABEL: load_i32_stride4_vf8:
612 ; AVX2-NEXT: vmovaps (%rdi), %ymm0
613 ; AVX2-NEXT: vmovaps 32(%rdi), %ymm3
614 ; AVX2-NEXT: vmovaps 64(%rdi), %ymm1
615 ; AVX2-NEXT: vmovaps 96(%rdi), %ymm2
616 ; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm4 = [0,4,0,4,0,4,0,4]
617 ; AVX2-NEXT: vpermps %ymm2, %ymm4, %ymm5
618 ; AVX2-NEXT: vpermps %ymm1, %ymm4, %ymm6
619 ; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7]
620 ; AVX2-NEXT: vpermps %ymm3, %ymm4, %ymm4
621 ; AVX2-NEXT: vmovaps (%rdi), %xmm6
622 ; AVX2-NEXT: vmovaps 16(%rdi), %xmm7
623 ; AVX2-NEXT: vmovaps 32(%rdi), %xmm8
624 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm9 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
625 ; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm9[0,1],xmm4[2,3]
626 ; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
627 ; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm5 = [1,5,1,5,1,5,1,5]
628 ; AVX2-NEXT: vpermps %ymm2, %ymm5, %ymm9
629 ; AVX2-NEXT: vpermps %ymm1, %ymm5, %ymm10
630 ; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7]
631 ; AVX2-NEXT: vmovaps 48(%rdi), %xmm10
632 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm11 = xmm8[0],xmm10[0],xmm8[1],xmm10[1]
633 ; AVX2-NEXT: vpermps %ymm0, %ymm5, %ymm5
634 ; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm11[2,3]
635 ; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm9[4,5,6,7]
636 ; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm9 = [2,6,2,6,2,6,2,6]
637 ; AVX2-NEXT: vpermps %ymm2, %ymm9, %ymm11
638 ; AVX2-NEXT: vpermps %ymm1, %ymm9, %ymm12
639 ; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7]
640 ; AVX2-NEXT: vpermps %ymm3, %ymm9, %ymm3
641 ; AVX2-NEXT: vunpckhps {{.*#+}} xmm6 = xmm6[2],xmm7[2],xmm6[3],xmm7[3]
642 ; AVX2-NEXT: vblendps {{.*#+}} xmm3 = xmm6[0,1],xmm3[2,3]
643 ; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7]
644 ; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm6 = [3,7,3,7,3,7,3,7]
645 ; AVX2-NEXT: vpermps %ymm2, %ymm6, %ymm2
646 ; AVX2-NEXT: vpermps %ymm1, %ymm6, %ymm1
647 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
648 ; AVX2-NEXT: vunpckhps {{.*#+}} xmm2 = xmm8[2],xmm10[2],xmm8[3],xmm10[3]
649 ; AVX2-NEXT: vpermps %ymm0, %ymm6, %ymm0
650 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
651 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
652 ; AVX2-NEXT: vmovaps %ymm4, (%rsi)
653 ; AVX2-NEXT: vmovaps %ymm5, (%rdx)
654 ; AVX2-NEXT: vmovaps %ymm3, (%rcx)
655 ; AVX2-NEXT: vmovaps %ymm0, (%r8)
656 ; AVX2-NEXT: vzeroupper
659 ; AVX2-FP-LABEL: load_i32_stride4_vf8:
661 ; AVX2-FP-NEXT: vmovaps (%rdi), %ymm0
662 ; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm3
663 ; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm1
664 ; AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm2
665 ; AVX2-FP-NEXT: vbroadcastsd {{.*#+}} ymm4 = [0,4,0,4,0,4,0,4]
666 ; AVX2-FP-NEXT: vpermps %ymm2, %ymm4, %ymm5
667 ; AVX2-FP-NEXT: vpermps %ymm1, %ymm4, %ymm6
668 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7]
669 ; AVX2-FP-NEXT: vpermps %ymm3, %ymm4, %ymm4
670 ; AVX2-FP-NEXT: vmovaps (%rdi), %xmm6
671 ; AVX2-FP-NEXT: vmovaps 16(%rdi), %xmm7
672 ; AVX2-FP-NEXT: vmovaps 32(%rdi), %xmm8
673 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm9 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
674 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm4 = xmm9[0,1],xmm4[2,3]
675 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
676 ; AVX2-FP-NEXT: vbroadcastsd {{.*#+}} ymm5 = [1,5,1,5,1,5,1,5]
677 ; AVX2-FP-NEXT: vpermps %ymm2, %ymm5, %ymm9
678 ; AVX2-FP-NEXT: vpermps %ymm1, %ymm5, %ymm10
679 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7]
680 ; AVX2-FP-NEXT: vmovaps 48(%rdi), %xmm10
681 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm11 = xmm8[0],xmm10[0],xmm8[1],xmm10[1]
682 ; AVX2-FP-NEXT: vpermps %ymm0, %ymm5, %ymm5
683 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm11[2,3]
684 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm9[4,5,6,7]
685 ; AVX2-FP-NEXT: vbroadcastsd {{.*#+}} ymm9 = [2,6,2,6,2,6,2,6]
686 ; AVX2-FP-NEXT: vpermps %ymm2, %ymm9, %ymm11
687 ; AVX2-FP-NEXT: vpermps %ymm1, %ymm9, %ymm12
688 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7]
689 ; AVX2-FP-NEXT: vpermps %ymm3, %ymm9, %ymm3
690 ; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm6 = xmm6[2],xmm7[2],xmm6[3],xmm7[3]
691 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm3 = xmm6[0,1],xmm3[2,3]
692 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7]
693 ; AVX2-FP-NEXT: vbroadcastsd {{.*#+}} ymm6 = [3,7,3,7,3,7,3,7]
694 ; AVX2-FP-NEXT: vpermps %ymm2, %ymm6, %ymm2
695 ; AVX2-FP-NEXT: vpermps %ymm1, %ymm6, %ymm1
696 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
697 ; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm2 = xmm8[2],xmm10[2],xmm8[3],xmm10[3]
698 ; AVX2-FP-NEXT: vpermps %ymm0, %ymm6, %ymm0
699 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
700 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
701 ; AVX2-FP-NEXT: vmovaps %ymm4, (%rsi)
702 ; AVX2-FP-NEXT: vmovaps %ymm5, (%rdx)
703 ; AVX2-FP-NEXT: vmovaps %ymm3, (%rcx)
704 ; AVX2-FP-NEXT: vmovaps %ymm0, (%r8)
705 ; AVX2-FP-NEXT: vzeroupper
708 ; AVX2-FCP-LABEL: load_i32_stride4_vf8:
710 ; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm0
711 ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm3
712 ; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm1
713 ; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm2
714 ; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm4 = [0,4,0,4,0,4,0,4]
715 ; AVX2-FCP-NEXT: vpermps %ymm2, %ymm4, %ymm5
716 ; AVX2-FCP-NEXT: vpermps %ymm1, %ymm4, %ymm6
717 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7]
718 ; AVX2-FCP-NEXT: vpermps %ymm3, %ymm4, %ymm4
719 ; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm6
720 ; AVX2-FCP-NEXT: vmovaps 16(%rdi), %xmm7
721 ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %xmm8
722 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm9 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
723 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm4 = xmm9[0,1],xmm4[2,3]
724 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
725 ; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm5 = [1,5,1,5,1,5,1,5]
726 ; AVX2-FCP-NEXT: vpermps %ymm2, %ymm5, %ymm9
727 ; AVX2-FCP-NEXT: vpermps %ymm1, %ymm5, %ymm10
728 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7]
729 ; AVX2-FCP-NEXT: vmovaps 48(%rdi), %xmm10
730 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm11 = xmm8[0],xmm10[0],xmm8[1],xmm10[1]
731 ; AVX2-FCP-NEXT: vpermps %ymm0, %ymm5, %ymm5
732 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm11[2,3]
733 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm9[4,5,6,7]
734 ; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm9 = [2,6,2,6,2,6,2,6]
735 ; AVX2-FCP-NEXT: vpermps %ymm2, %ymm9, %ymm11
736 ; AVX2-FCP-NEXT: vpermps %ymm1, %ymm9, %ymm12
737 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7]
738 ; AVX2-FCP-NEXT: vpermps %ymm3, %ymm9, %ymm3
739 ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm6 = xmm6[2],xmm7[2],xmm6[3],xmm7[3]
740 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm3 = xmm6[0,1],xmm3[2,3]
741 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7]
742 ; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm6 = [3,7,3,7,3,7,3,7]
743 ; AVX2-FCP-NEXT: vpermps %ymm2, %ymm6, %ymm2
744 ; AVX2-FCP-NEXT: vpermps %ymm1, %ymm6, %ymm1
745 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
746 ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm2 = xmm8[2],xmm10[2],xmm8[3],xmm10[3]
747 ; AVX2-FCP-NEXT: vpermps %ymm0, %ymm6, %ymm0
748 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
749 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
750 ; AVX2-FCP-NEXT: vmovaps %ymm4, (%rsi)
751 ; AVX2-FCP-NEXT: vmovaps %ymm5, (%rdx)
752 ; AVX2-FCP-NEXT: vmovaps %ymm3, (%rcx)
753 ; AVX2-FCP-NEXT: vmovaps %ymm0, (%r8)
754 ; AVX2-FCP-NEXT: vzeroupper
755 ; AVX2-FCP-NEXT: retq
757 ; AVX512-LABEL: load_i32_stride4_vf8:
759 ; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,4,8,12,16,20,24,28]
760 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm1
761 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm2
762 ; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm0
763 ; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,5,9,13,17,21,25,29]
764 ; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
765 ; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,6,10,14,18,22,26,30]
766 ; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm4
767 ; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm5 = [3,7,11,15,19,23,27,31]
768 ; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm5
769 ; AVX512-NEXT: vmovdqa %ymm0, (%rsi)
770 ; AVX512-NEXT: vmovdqa %ymm3, (%rdx)
771 ; AVX512-NEXT: vmovdqa %ymm4, (%rcx)
772 ; AVX512-NEXT: vmovdqa %ymm5, (%r8)
773 ; AVX512-NEXT: vzeroupper
776 ; AVX512-FCP-LABEL: load_i32_stride4_vf8:
777 ; AVX512-FCP: # %bb.0:
778 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,4,8,12,16,20,24,28]
779 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm1
780 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2
781 ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm0
782 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,5,9,13,17,21,25,29]
783 ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
784 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,6,10,14,18,22,26,30]
785 ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm4
786 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [3,7,11,15,19,23,27,31]
787 ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm5
788 ; AVX512-FCP-NEXT: vmovdqa %ymm0, (%rsi)
789 ; AVX512-FCP-NEXT: vmovdqa %ymm3, (%rdx)
790 ; AVX512-FCP-NEXT: vmovdqa %ymm4, (%rcx)
791 ; AVX512-FCP-NEXT: vmovdqa %ymm5, (%r8)
792 ; AVX512-FCP-NEXT: vzeroupper
793 ; AVX512-FCP-NEXT: retq
795 ; AVX512DQ-LABEL: load_i32_stride4_vf8:
797 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,4,8,12,16,20,24,28]
798 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm1
799 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm2
800 ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm0
801 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,5,9,13,17,21,25,29]
802 ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
803 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,6,10,14,18,22,26,30]
804 ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm4
805 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm5 = [3,7,11,15,19,23,27,31]
806 ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm5
807 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rsi)
808 ; AVX512DQ-NEXT: vmovdqa %ymm3, (%rdx)
809 ; AVX512DQ-NEXT: vmovdqa %ymm4, (%rcx)
810 ; AVX512DQ-NEXT: vmovdqa %ymm5, (%r8)
811 ; AVX512DQ-NEXT: vzeroupper
812 ; AVX512DQ-NEXT: retq
814 ; AVX512DQ-FCP-LABEL: load_i32_stride4_vf8:
815 ; AVX512DQ-FCP: # %bb.0:
816 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,4,8,12,16,20,24,28]
817 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm1
818 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2
819 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm0
820 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,5,9,13,17,21,25,29]
821 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
822 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,6,10,14,18,22,26,30]
823 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm4
824 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [3,7,11,15,19,23,27,31]
825 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm5
826 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%rsi)
827 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm3, (%rdx)
828 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, (%rcx)
829 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, (%r8)
830 ; AVX512DQ-FCP-NEXT: vzeroupper
831 ; AVX512DQ-FCP-NEXT: retq
833 ; AVX512BW-LABEL: load_i32_stride4_vf8:
835 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,4,8,12,16,20,24,28]
836 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1
837 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm2
838 ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm0
839 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,5,9,13,17,21,25,29]
840 ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
841 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,6,10,14,18,22,26,30]
842 ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm4
843 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm5 = [3,7,11,15,19,23,27,31]
844 ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm5
845 ; AVX512BW-NEXT: vmovdqa %ymm0, (%rsi)
846 ; AVX512BW-NEXT: vmovdqa %ymm3, (%rdx)
847 ; AVX512BW-NEXT: vmovdqa %ymm4, (%rcx)
848 ; AVX512BW-NEXT: vmovdqa %ymm5, (%r8)
849 ; AVX512BW-NEXT: vzeroupper
850 ; AVX512BW-NEXT: retq
852 ; AVX512BW-FCP-LABEL: load_i32_stride4_vf8:
853 ; AVX512BW-FCP: # %bb.0:
854 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,4,8,12,16,20,24,28]
855 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1
856 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2
857 ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm0
858 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,5,9,13,17,21,25,29]
859 ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
860 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,6,10,14,18,22,26,30]
861 ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm4
862 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [3,7,11,15,19,23,27,31]
863 ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm5
864 ; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%rsi)
865 ; AVX512BW-FCP-NEXT: vmovdqa %ymm3, (%rdx)
866 ; AVX512BW-FCP-NEXT: vmovdqa %ymm4, (%rcx)
867 ; AVX512BW-FCP-NEXT: vmovdqa %ymm5, (%r8)
868 ; AVX512BW-FCP-NEXT: vzeroupper
869 ; AVX512BW-FCP-NEXT: retq
871 ; AVX512DQ-BW-LABEL: load_i32_stride4_vf8:
872 ; AVX512DQ-BW: # %bb.0:
873 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,4,8,12,16,20,24,28]
874 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm1
875 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm2
876 ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm0
877 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,5,9,13,17,21,25,29]
878 ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
879 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,6,10,14,18,22,26,30]
880 ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm4
881 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm5 = [3,7,11,15,19,23,27,31]
882 ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm5
883 ; AVX512DQ-BW-NEXT: vmovdqa %ymm0, (%rsi)
884 ; AVX512DQ-BW-NEXT: vmovdqa %ymm3, (%rdx)
885 ; AVX512DQ-BW-NEXT: vmovdqa %ymm4, (%rcx)
886 ; AVX512DQ-BW-NEXT: vmovdqa %ymm5, (%r8)
887 ; AVX512DQ-BW-NEXT: vzeroupper
888 ; AVX512DQ-BW-NEXT: retq
890 ; AVX512DQ-BW-FCP-LABEL: load_i32_stride4_vf8:
891 ; AVX512DQ-BW-FCP: # %bb.0:
892 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,4,8,12,16,20,24,28]
893 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1
894 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2
895 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm0
896 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,5,9,13,17,21,25,29]
897 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
898 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,6,10,14,18,22,26,30]
899 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm4
900 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [3,7,11,15,19,23,27,31]
901 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm5
902 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%rsi)
903 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm3, (%rdx)
904 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm4, (%rcx)
905 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm5, (%r8)
906 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
907 ; AVX512DQ-BW-FCP-NEXT: retq
  %wide.vec = load <32 x i32>, ptr %in.vec, align 64
  %strided.vec0 = shufflevector <32 x i32> %wide.vec, <32 x i32> poison, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
  %strided.vec1 = shufflevector <32 x i32> %wide.vec, <32 x i32> poison, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
  %strided.vec2 = shufflevector <32 x i32> %wide.vec, <32 x i32> poison, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
  %strided.vec3 = shufflevector <32 x i32> %wide.vec, <32 x i32> poison, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
  store <8 x i32> %strided.vec0, ptr %out.vec0, align 64
  store <8 x i32> %strided.vec1, ptr %out.vec1, align 64
  store <8 x i32> %strided.vec2, ptr %out.vec2, align 64
  store <8 x i32> %strided.vec3, ptr %out.vec3, align 64
  ret void
}

define void @load_i32_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3) nounwind {
921 ; SSE-LABEL: load_i32_stride4_vf16:
923 ; SSE-NEXT: subq $24, %rsp
924 ; SSE-NEXT: movaps 208(%rdi), %xmm2
925 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
926 ; SSE-NEXT: movaps 240(%rdi), %xmm4
927 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
928 ; SSE-NEXT: movaps 224(%rdi), %xmm12
929 ; SSE-NEXT: movaps 144(%rdi), %xmm6
930 ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
931 ; SSE-NEXT: movaps 128(%rdi), %xmm3
932 ; SSE-NEXT: movaps 176(%rdi), %xmm5
933 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
934 ; SSE-NEXT: movaps 160(%rdi), %xmm9
935 ; SSE-NEXT: movaps 80(%rdi), %xmm1
936 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
937 ; SSE-NEXT: movaps 64(%rdi), %xmm11
938 ; SSE-NEXT: movaps 112(%rdi), %xmm8
939 ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
940 ; SSE-NEXT: movaps 96(%rdi), %xmm7
941 ; SSE-NEXT: movaps %xmm7, %xmm0
942 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1]
943 ; SSE-NEXT: movaps %xmm11, %xmm13
944 ; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1]
945 ; SSE-NEXT: movaps %xmm13, %xmm1
946 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
947 ; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill
948 ; SSE-NEXT: movaps %xmm9, %xmm1
949 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
950 ; SSE-NEXT: movaps %xmm3, %xmm5
951 ; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
952 ; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1]
953 ; SSE-NEXT: movaps %xmm5, %xmm0
954 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
955 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
956 ; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1]
957 ; SSE-NEXT: movaps %xmm12, %xmm0
958 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
959 ; SSE-NEXT: movaps 192(%rdi), %xmm10
960 ; SSE-NEXT: movaps %xmm10, %xmm15
961 ; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm2[0],xmm15[1],xmm2[1]
962 ; SSE-NEXT: movaps %xmm15, %xmm1
963 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
964 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
965 ; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1]
966 ; SSE-NEXT: movaps 32(%rdi), %xmm2
967 ; SSE-NEXT: movaps 48(%rdi), %xmm8
968 ; SSE-NEXT: movaps %xmm2, %xmm1
969 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1]
970 ; SSE-NEXT: movaps (%rdi), %xmm0
971 ; SSE-NEXT: movaps 16(%rdi), %xmm6
972 ; SSE-NEXT: movaps %xmm0, %xmm14
973 ; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm6[0],xmm14[1],xmm6[1]
974 ; SSE-NEXT: movaps %xmm14, %xmm4
975 ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm1[0]
976 ; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm1[1]
977 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
978 ; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3]
979 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
980 ; SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3]
981 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
982 ; SSE-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3]
983 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
984 ; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3]
985 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
986 ; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3]
987 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
988 ; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3]
989 ; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm8[2],xmm2[3],xmm8[3]
990 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3]
991 ; SSE-NEXT: movaps %xmm11, %xmm1
992 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm7[0]
993 ; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm7[1]
994 ; SSE-NEXT: movaps %xmm3, %xmm6
995 ; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm9[0]
996 ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm9[1]
997 ; SSE-NEXT: movaps %xmm10, %xmm7
998 ; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm12[0]
999 ; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm12[1]
1000 ; SSE-NEXT: movaps %xmm0, %xmm8
1001 ; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm2[0]
1002 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
1003 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1004 ; SSE-NEXT: movaps %xmm2, 48(%rsi)
1005 ; SSE-NEXT: movaps %xmm4, (%rsi)
1006 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1007 ; SSE-NEXT: movaps %xmm2, 32(%rsi)
1008 ; SSE-NEXT: movaps (%rsp), %xmm2 # 16-byte Reload
1009 ; SSE-NEXT: movaps %xmm2, 16(%rsi)
1010 ; SSE-NEXT: movaps %xmm15, 48(%rdx)
1011 ; SSE-NEXT: movaps %xmm14, (%rdx)
1012 ; SSE-NEXT: movaps %xmm5, 32(%rdx)
1013 ; SSE-NEXT: movaps %xmm13, 16(%rdx)
1014 ; SSE-NEXT: movaps %xmm7, 48(%rcx)
1015 ; SSE-NEXT: movaps %xmm6, 32(%rcx)
1016 ; SSE-NEXT: movaps %xmm1, 16(%rcx)
1017 ; SSE-NEXT: movaps %xmm8, (%rcx)
1018 ; SSE-NEXT: movaps %xmm10, 48(%r8)
1019 ; SSE-NEXT: movaps %xmm3, 32(%r8)
1020 ; SSE-NEXT: movaps %xmm11, 16(%r8)
1021 ; SSE-NEXT: movaps %xmm0, (%r8)
1022 ; SSE-NEXT: addq $24, %rsp
1025 ; AVX-LABEL: load_i32_stride4_vf16:
1027 ; AVX-NEXT: subq $264, %rsp # imm = 0x108
1028 ; AVX-NEXT: vmovaps 64(%rdi), %ymm5
1029 ; AVX-NEXT: vmovaps 96(%rdi), %ymm4
1030 ; AVX-NEXT: vmovaps 192(%rdi), %ymm10
1031 ; AVX-NEXT: vmovaps 224(%rdi), %ymm14
1032 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm14[2,3,0,1]
1033 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm14[0],ymm15[0],ymm14[2],ymm15[2]
1034 ; AVX-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1035 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm10[2,3,0,1]
1036 ; AVX-NEXT: vunpcklps {{.*#+}} ymm7 = ymm3[0],ymm10[0],ymm3[1],ymm10[1],ymm3[4],ymm10[4],ymm3[5],ymm10[5]
1037 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,0],ymm7[4,5],ymm0[6,4]
1038 ; AVX-NEXT: vmovaps 160(%rdi), %xmm1
1039 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1040 ; AVX-NEXT: vmovaps 176(%rdi), %xmm2
1041 ; AVX-NEXT: vmovlhps {{.*#+}} xmm11 = xmm2[0],xmm1[0]
1042 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1043 ; AVX-NEXT: vmovaps 144(%rdi), %xmm1
1044 ; AVX-NEXT: vmovaps 128(%rdi), %xmm6
1045 ; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1046 ; AVX-NEXT: vunpcklps {{.*#+}} xmm12 = xmm6[0],xmm1[0],xmm6[1],xmm1[1]
1047 ; AVX-NEXT: vshufps {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,0]
1048 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7]
1049 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1050 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm4[2,3,0,1]
1051 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],ymm8[0],ymm4[2],ymm8[2]
1052 ; AVX-NEXT: vmovaps %ymm4, %ymm12
1053 ; AVX-NEXT: vmovups %ymm5, (%rsp) # 32-byte Spill
1054 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm5[2,3,0,1]
1055 ; AVX-NEXT: vunpcklps {{.*#+}} ymm13 = ymm11[0],ymm5[0],ymm11[1],ymm5[1],ymm11[4],ymm5[4],ymm11[5],ymm5[5]
1056 ; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm13[0,1],ymm0[2,0],ymm13[4,5],ymm0[6,4]
1057 ; AVX-NEXT: vmovaps (%rdi), %xmm9
1058 ; AVX-NEXT: vmovaps 16(%rdi), %xmm5
1059 ; AVX-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1060 ; AVX-NEXT: vmovaps 32(%rdi), %xmm6
1061 ; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1062 ; AVX-NEXT: vmovaps 48(%rdi), %xmm0
1063 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1064 ; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm6[0]
1065 ; AVX-NEXT: vunpcklps {{.*#+}} xmm6 = xmm9[0],xmm5[0],xmm9[1],xmm5[1]
1066 ; AVX-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1067 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm6[0,1],xmm0[2,0]
1068 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7]
1069 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1070 ; AVX-NEXT: vmovaps %ymm15, %ymm4
1071 ; AVX-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1072 ; AVX-NEXT: vunpcklps {{.*#+}} ymm7 = ymm15[0],ymm14[0],ymm15[1],ymm14[1],ymm15[4],ymm14[4],ymm15[5],ymm14[5]
1073 ; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm10[1,0],ymm3[1,0],ymm10[5,4],ymm3[5,4]
1074 ; AVX-NEXT: vmovaps %ymm10, %ymm15
1075 ; AVX-NEXT: vmovaps %ymm3, %ymm13
1076 ; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm6[2,0],ymm7[2,3],ymm6[6,4],ymm7[6,7]
1077 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
1078 ; AVX-NEXT: vinsertps {{.*#+}} xmm6 = xmm5[1],xmm1[1],zero,zero
1079 ; AVX-NEXT: vmovaps %xmm1, %xmm14
1080 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1081 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
1082 ; AVX-NEXT: vunpcklps {{.*#+}} xmm7 = xmm10[0],xmm2[0],xmm10[1],xmm2[1]
1083 ; AVX-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3]
1084 ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm3[4,5,6,7]
1085 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1086 ; AVX-NEXT: vmovaps %ymm12, %ymm0
1087 ; AVX-NEXT: vmovaps %ymm8, %ymm12
1088 ; AVX-NEXT: vunpcklps {{.*#+}} ymm3 = ymm8[0],ymm0[0],ymm8[1],ymm0[1],ymm8[4],ymm0[4],ymm8[5],ymm0[5]
1089 ; AVX-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload
1090 ; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm2[1,0],ymm11[1,0],ymm2[5,4],ymm11[5,4]
1091 ; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm6[2,0],ymm3[2,3],ymm6[6,4],ymm3[6,7]
1092 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
1093 ; AVX-NEXT: vinsertps {{.*#+}} xmm6 = xmm9[1],xmm8[1],zero,zero
1094 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
1095 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1096 ; AVX-NEXT: vunpcklps {{.*#+}} xmm7 = xmm9[0],xmm1[0],xmm9[1],xmm1[1]
1097 ; AVX-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3]
1098 ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7]
1099 ; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1100 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
1101 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm4[1],ymm3[3],ymm4[3]
1102 ; AVX-NEXT: vunpckhps {{.*#+}} ymm6 = ymm13[2],ymm15[2],ymm13[3],ymm15[3],ymm13[6],ymm15[6],ymm13[7],ymm15[7]
1103 ; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm6[0,1],ymm3[2,0],ymm6[4,5],ymm3[6,4]
1104 ; AVX-NEXT: vunpckhps {{.*#+}} xmm6 = xmm5[2],xmm14[2],xmm5[3],xmm14[3]
1105 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
1106 ; AVX-NEXT: vinsertps {{.*#+}} xmm7 = zero,zero,xmm10[2],xmm3[2]
1107 ; AVX-NEXT: vmovaps %xmm10, %xmm14
1108 ; AVX-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3]
1109 ; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7]
1110 ; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1111 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm0[1],ymm12[1],ymm0[3],ymm12[3]
1112 ; AVX-NEXT: vmovaps %ymm0, %ymm5
1113 ; AVX-NEXT: vunpckhps {{.*#+}} ymm7 = ymm11[2],ymm2[2],ymm11[3],ymm2[3],ymm11[6],ymm2[6],ymm11[7],ymm2[7]
1114 ; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,0],ymm7[4,5],ymm6[6,4]
1115 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
1116 ; AVX-NEXT: vunpckhps {{.*#+}} xmm7 = xmm10[2],xmm8[2],xmm10[3],xmm8[3]
1117 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = zero,zero,xmm9[2],xmm1[2]
1118 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm7[0,1],xmm0[2,3]
1119 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
1120 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
1121 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload
1122 ; AVX-NEXT: # ymm4 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7]
1123 ; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm15[3,0],ymm13[3,0],ymm15[7,4],ymm13[7,4]
1124 ; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm4[2,3],ymm2[6,4],ymm4[6,7]
1125 ; AVX-NEXT: vunpckhps {{.*#+}} xmm4 = xmm14[2],xmm3[2],xmm14[3],xmm3[3]
1126 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
1127 ; AVX-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm6 # 16-byte Folded Reload
1128 ; AVX-NEXT: # xmm6 = xmm3[3,0],mem[3,0]
1129 ; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm6[2,0],xmm4[2,3]
1130 ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
1131 ; AVX-NEXT: vunpckhps {{.*#+}} ymm3 = ymm12[2],ymm5[2],ymm12[3],ymm5[3],ymm12[6],ymm5[6],ymm12[7],ymm5[7]
1132 ; AVX-NEXT: vmovups (%rsp), %ymm4 # 32-byte Reload
1133 ; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm4[3,0],ymm11[3,0],ymm4[7,4],ymm11[7,4]
1134 ; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm4[2,0],ymm3[2,3],ymm4[6,4],ymm3[6,7]
1135 ; AVX-NEXT: vunpckhps {{.*#+}} xmm1 = xmm9[2],xmm1[2],xmm9[3],xmm1[3]
1136 ; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm8[3,0],xmm10[3,0]
1137 ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,0],xmm1[2,3]
1138 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
1139 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
1140 ; AVX-NEXT: vmovaps %ymm3, 32(%rsi)
1141 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
1142 ; AVX-NEXT: vmovaps %ymm3, (%rsi)
1143 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
1144 ; AVX-NEXT: vmovaps %ymm3, 32(%rdx)
1145 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
1146 ; AVX-NEXT: vmovaps %ymm3, (%rdx)
1147 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
1148 ; AVX-NEXT: vmovaps %ymm3, 32(%rcx)
1149 ; AVX-NEXT: vmovaps %ymm0, (%rcx)
1150 ; AVX-NEXT: vmovaps %ymm2, 32(%r8)
1151 ; AVX-NEXT: vmovaps %ymm1, (%r8)
1152 ; AVX-NEXT: addq $264, %rsp # imm = 0x108
1153 ; AVX-NEXT: vzeroupper
1154 ; AVX-NEXT: retq
1156 ; AVX2-LABEL: load_i32_stride4_vf16:
1157 ; AVX2: # %bb.0:
1158 ; AVX2-NEXT: subq $104, %rsp
1159 ; AVX2-NEXT: vmovaps (%rdi), %ymm3
1160 ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1161 ; AVX2-NEXT: vmovaps 32(%rdi), %ymm8
1162 ; AVX2-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1163 ; AVX2-NEXT: vmovaps 64(%rdi), %ymm4
1164 ; AVX2-NEXT: vmovaps 96(%rdi), %ymm5
1165 ; AVX2-NEXT: vmovaps 160(%rdi), %ymm9
1166 ; AVX2-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1167 ; AVX2-NEXT: vmovaps 192(%rdi), %ymm7
1168 ; AVX2-NEXT: vmovaps 224(%rdi), %ymm2
1169 ; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm0 = [0,4,0,4,0,4,0,4]
1170 ; AVX2-NEXT: vpermps %ymm2, %ymm0, %ymm1
1171 ; AVX2-NEXT: vpermps %ymm7, %ymm0, %ymm6
1172 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7]
1173 ; AVX2-NEXT: vmovaps 144(%rdi), %xmm10
1174 ; AVX2-NEXT: vmovaps 128(%rdi), %xmm11
1175 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm6 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
1176 ; AVX2-NEXT: vpermps %ymm9, %ymm0, %ymm9
1177 ; AVX2-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm9[2,3]
1178 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7]
1179 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1180 ; AVX2-NEXT: vpermps %ymm5, %ymm0, %ymm1
1181 ; AVX2-NEXT: vpermps %ymm4, %ymm0, %ymm6
1182 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7]
1183 ; AVX2-NEXT: vpermps %ymm8, %ymm0, %ymm0
1184 ; AVX2-NEXT: vmovaps (%rdi), %xmm12
1185 ; AVX2-NEXT: vmovaps 16(%rdi), %xmm13
1186 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm6 = xmm12[0],xmm13[0],xmm12[1],xmm13[1]
1187 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm6[0,1],xmm0[2,3]
1188 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
1189 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1190 ; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm0 = [1,5,1,5,1,5,1,5]
1191 ; AVX2-NEXT: vpermps %ymm5, %ymm0, %ymm1
1192 ; AVX2-NEXT: vpermps %ymm4, %ymm0, %ymm6
1193 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7]
1194 ; AVX2-NEXT: vmovaps 32(%rdi), %xmm14
1195 ; AVX2-NEXT: vmovaps 48(%rdi), %xmm6
1196 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm9 = xmm14[0],xmm6[0],xmm14[1],xmm6[1]
1197 ; AVX2-NEXT: vpermps %ymm3, %ymm0, %ymm15
1198 ; AVX2-NEXT: vblendps {{.*#+}} xmm9 = xmm15[0,1],xmm9[2,3]
1199 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm1[4,5,6,7]
1200 ; AVX2-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill
1201 ; AVX2-NEXT: vpermps %ymm2, %ymm0, %ymm1
1202 ; AVX2-NEXT: vpermps %ymm7, %ymm0, %ymm9
1203 ; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm1[6,7]
1204 ; AVX2-NEXT: vmovaps 128(%rdi), %ymm15
1205 ; AVX2-NEXT: vpermps %ymm15, %ymm0, %ymm0
1206 ; AVX2-NEXT: vmovaps 176(%rdi), %xmm3
1207 ; AVX2-NEXT: vmovaps 160(%rdi), %xmm1
1208 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm8 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
1209 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm8[2,3]
1210 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7]
1211 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1212 ; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm0 = [2,6,2,6,2,6,2,6]
1213 ; AVX2-NEXT: vpermps %ymm2, %ymm0, %ymm8
1214 ; AVX2-NEXT: vpermps %ymm7, %ymm0, %ymm9
1215 ; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7]
1216 ; AVX2-NEXT: vunpckhps {{.*#+}} xmm9 = xmm11[2],xmm10[2],xmm11[3],xmm10[3]
1217 ; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload
1218 ; AVX2-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1],xmm10[2,3]
1219 ; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
1220 ; AVX2-NEXT: vpermps %ymm5, %ymm0, %ymm9
1221 ; AVX2-NEXT: vpermps %ymm4, %ymm0, %ymm10
1222 ; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7]
1223 ; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1224 ; AVX2-NEXT: vunpckhps {{.*#+}} xmm10 = xmm12[2],xmm13[2],xmm12[3],xmm13[3]
1225 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
1226 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7]
1227 ; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm9 = [3,7,3,7,3,7,3,7]
1228 ; AVX2-NEXT: vpermps %ymm5, %ymm9, %ymm5
1229 ; AVX2-NEXT: vpermps %ymm4, %ymm9, %ymm4
1230 ; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
1231 ; AVX2-NEXT: vunpckhps {{.*#+}} xmm5 = xmm14[2],xmm6[2],xmm14[3],xmm6[3]
1232 ; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm6 # 32-byte Folded Reload
1233 ; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
1234 ; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
1235 ; AVX2-NEXT: vpermps %ymm2, %ymm9, %ymm2
1236 ; AVX2-NEXT: vpermps %ymm7, %ymm9, %ymm5
1237 ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7]
1238 ; AVX2-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
1239 ; AVX2-NEXT: vpermps %ymm15, %ymm9, %ymm3
1240 ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
1241 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
1242 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
1243 ; AVX2-NEXT: vmovaps %ymm2, 32(%rsi)
1244 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
1245 ; AVX2-NEXT: vmovaps %ymm2, (%rsi)
1246 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
1247 ; AVX2-NEXT: vmovaps %ymm2, 32(%rdx)
1248 ; AVX2-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload
1249 ; AVX2-NEXT: vmovaps %ymm2, (%rdx)
1250 ; AVX2-NEXT: vmovaps %ymm8, 32(%rcx)
1251 ; AVX2-NEXT: vmovaps %ymm0, (%rcx)
1252 ; AVX2-NEXT: vmovaps %ymm1, 32(%r8)
1253 ; AVX2-NEXT: vmovaps %ymm4, (%r8)
1254 ; AVX2-NEXT: addq $104, %rsp
1255 ; AVX2-NEXT: vzeroupper
1256 ; AVX2-NEXT: retq
1258 ; AVX2-FP-LABEL: load_i32_stride4_vf16:
1259 ; AVX2-FP: # %bb.0:
1260 ; AVX2-FP-NEXT: subq $104, %rsp
1261 ; AVX2-FP-NEXT: vmovaps (%rdi), %ymm3
1262 ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1263 ; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm8
1264 ; AVX2-FP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1265 ; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm4
1266 ; AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm5
1267 ; AVX2-FP-NEXT: vmovaps 160(%rdi), %ymm9
1268 ; AVX2-FP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1269 ; AVX2-FP-NEXT: vmovaps 192(%rdi), %ymm7
1270 ; AVX2-FP-NEXT: vmovaps 224(%rdi), %ymm2
1271 ; AVX2-FP-NEXT: vbroadcastsd {{.*#+}} ymm0 = [0,4,0,4,0,4,0,4]
1272 ; AVX2-FP-NEXT: vpermps %ymm2, %ymm0, %ymm1
1273 ; AVX2-FP-NEXT: vpermps %ymm7, %ymm0, %ymm6
1274 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7]
1275 ; AVX2-FP-NEXT: vmovaps 144(%rdi), %xmm10
1276 ; AVX2-FP-NEXT: vmovaps 128(%rdi), %xmm11
1277 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm6 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
1278 ; AVX2-FP-NEXT: vpermps %ymm9, %ymm0, %ymm9
1279 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm9[2,3]
1280 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7]
1281 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1282 ; AVX2-FP-NEXT: vpermps %ymm5, %ymm0, %ymm1
1283 ; AVX2-FP-NEXT: vpermps %ymm4, %ymm0, %ymm6
1284 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7]
1285 ; AVX2-FP-NEXT: vpermps %ymm8, %ymm0, %ymm0
1286 ; AVX2-FP-NEXT: vmovaps (%rdi), %xmm12
1287 ; AVX2-FP-NEXT: vmovaps 16(%rdi), %xmm13
1288 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm6 = xmm12[0],xmm13[0],xmm12[1],xmm13[1]
1289 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm6[0,1],xmm0[2,3]
1290 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
1291 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1292 ; AVX2-FP-NEXT: vbroadcastsd {{.*#+}} ymm0 = [1,5,1,5,1,5,1,5]
1293 ; AVX2-FP-NEXT: vpermps %ymm5, %ymm0, %ymm1
1294 ; AVX2-FP-NEXT: vpermps %ymm4, %ymm0, %ymm6
1295 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7]
1296 ; AVX2-FP-NEXT: vmovaps 32(%rdi), %xmm14
1297 ; AVX2-FP-NEXT: vmovaps 48(%rdi), %xmm6
1298 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm9 = xmm14[0],xmm6[0],xmm14[1],xmm6[1]
1299 ; AVX2-FP-NEXT: vpermps %ymm3, %ymm0, %ymm15
1300 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm9 = xmm15[0,1],xmm9[2,3]
1301 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm1[4,5,6,7]
1302 ; AVX2-FP-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill
1303 ; AVX2-FP-NEXT: vpermps %ymm2, %ymm0, %ymm1
1304 ; AVX2-FP-NEXT: vpermps %ymm7, %ymm0, %ymm9
1305 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm1[6,7]
1306 ; AVX2-FP-NEXT: vmovaps 128(%rdi), %ymm15
1307 ; AVX2-FP-NEXT: vpermps %ymm15, %ymm0, %ymm0
1308 ; AVX2-FP-NEXT: vmovaps 176(%rdi), %xmm3
1309 ; AVX2-FP-NEXT: vmovaps 160(%rdi), %xmm1
1310 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm8 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
1311 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm8[2,3]
1312 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7]
1313 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1314 ; AVX2-FP-NEXT: vbroadcastsd {{.*#+}} ymm0 = [2,6,2,6,2,6,2,6]
1315 ; AVX2-FP-NEXT: vpermps %ymm2, %ymm0, %ymm8
1316 ; AVX2-FP-NEXT: vpermps %ymm7, %ymm0, %ymm9
1317 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7]
1318 ; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm9 = xmm11[2],xmm10[2],xmm11[3],xmm10[3]
1319 ; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload
1320 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1],xmm10[2,3]
1321 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
1322 ; AVX2-FP-NEXT: vpermps %ymm5, %ymm0, %ymm9
1323 ; AVX2-FP-NEXT: vpermps %ymm4, %ymm0, %ymm10
1324 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7]
1325 ; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1326 ; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm10 = xmm12[2],xmm13[2],xmm12[3],xmm13[3]
1327 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
1328 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7]
1329 ; AVX2-FP-NEXT: vbroadcastsd {{.*#+}} ymm9 = [3,7,3,7,3,7,3,7]
1330 ; AVX2-FP-NEXT: vpermps %ymm5, %ymm9, %ymm5
1331 ; AVX2-FP-NEXT: vpermps %ymm4, %ymm9, %ymm4
1332 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
1333 ; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm5 = xmm14[2],xmm6[2],xmm14[3],xmm6[3]
1334 ; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm6 # 32-byte Folded Reload
1335 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
1336 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
1337 ; AVX2-FP-NEXT: vpermps %ymm2, %ymm9, %ymm2
1338 ; AVX2-FP-NEXT: vpermps %ymm7, %ymm9, %ymm5
1339 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7]
1340 ; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
1341 ; AVX2-FP-NEXT: vpermps %ymm15, %ymm9, %ymm3
1342 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
1343 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
1344 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
1345 ; AVX2-FP-NEXT: vmovaps %ymm2, 32(%rsi)
1346 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
1347 ; AVX2-FP-NEXT: vmovaps %ymm2, (%rsi)
1348 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
1349 ; AVX2-FP-NEXT: vmovaps %ymm2, 32(%rdx)
1350 ; AVX2-FP-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload
1351 ; AVX2-FP-NEXT: vmovaps %ymm2, (%rdx)
1352 ; AVX2-FP-NEXT: vmovaps %ymm8, 32(%rcx)
1353 ; AVX2-FP-NEXT: vmovaps %ymm0, (%rcx)
1354 ; AVX2-FP-NEXT: vmovaps %ymm1, 32(%r8)
1355 ; AVX2-FP-NEXT: vmovaps %ymm4, (%r8)
1356 ; AVX2-FP-NEXT: addq $104, %rsp
1357 ; AVX2-FP-NEXT: vzeroupper
1358 ; AVX2-FP-NEXT: retq
1360 ; AVX2-FCP-LABEL: load_i32_stride4_vf16:
1361 ; AVX2-FCP: # %bb.0:
1362 ; AVX2-FCP-NEXT: subq $104, %rsp
1363 ; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm3
1364 ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1365 ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm8
1366 ; AVX2-FCP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1367 ; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm4
1368 ; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm5
1369 ; AVX2-FCP-NEXT: vmovaps 160(%rdi), %ymm9
1370 ; AVX2-FCP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1371 ; AVX2-FCP-NEXT: vmovaps 192(%rdi), %ymm7
1372 ; AVX2-FCP-NEXT: vmovaps 224(%rdi), %ymm2
1373 ; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm0 = [0,4,0,4,0,4,0,4]
1374 ; AVX2-FCP-NEXT: vpermps %ymm2, %ymm0, %ymm1
1375 ; AVX2-FCP-NEXT: vpermps %ymm7, %ymm0, %ymm6
1376 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7]
1377 ; AVX2-FCP-NEXT: vmovaps 144(%rdi), %xmm10
1378 ; AVX2-FCP-NEXT: vmovaps 128(%rdi), %xmm11
1379 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm6 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
1380 ; AVX2-FCP-NEXT: vpermps %ymm9, %ymm0, %ymm9
1381 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm9[2,3]
1382 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7]
1383 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1384 ; AVX2-FCP-NEXT: vpermps %ymm5, %ymm0, %ymm1
1385 ; AVX2-FCP-NEXT: vpermps %ymm4, %ymm0, %ymm6
1386 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7]
1387 ; AVX2-FCP-NEXT: vpermps %ymm8, %ymm0, %ymm0
1388 ; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm12
1389 ; AVX2-FCP-NEXT: vmovaps 16(%rdi), %xmm13
1390 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm6 = xmm12[0],xmm13[0],xmm12[1],xmm13[1]
1391 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm6[0,1],xmm0[2,3]
1392 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
1393 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1394 ; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm0 = [1,5,1,5,1,5,1,5]
1395 ; AVX2-FCP-NEXT: vpermps %ymm5, %ymm0, %ymm1
1396 ; AVX2-FCP-NEXT: vpermps %ymm4, %ymm0, %ymm6
1397 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7]
1398 ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %xmm14
1399 ; AVX2-FCP-NEXT: vmovaps 48(%rdi), %xmm6
1400 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm9 = xmm14[0],xmm6[0],xmm14[1],xmm6[1]
1401 ; AVX2-FCP-NEXT: vpermps %ymm3, %ymm0, %ymm15
1402 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm9 = xmm15[0,1],xmm9[2,3]
1403 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm1[4,5,6,7]
1404 ; AVX2-FCP-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill
1405 ; AVX2-FCP-NEXT: vpermps %ymm2, %ymm0, %ymm1
1406 ; AVX2-FCP-NEXT: vpermps %ymm7, %ymm0, %ymm9
1407 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm1[6,7]
1408 ; AVX2-FCP-NEXT: vmovaps 128(%rdi), %ymm15
1409 ; AVX2-FCP-NEXT: vpermps %ymm15, %ymm0, %ymm0
1410 ; AVX2-FCP-NEXT: vmovaps 176(%rdi), %xmm3
1411 ; AVX2-FCP-NEXT: vmovaps 160(%rdi), %xmm1
1412 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm8 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
1413 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm8[2,3]
1414 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7]
1415 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1416 ; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm0 = [2,6,2,6,2,6,2,6]
1417 ; AVX2-FCP-NEXT: vpermps %ymm2, %ymm0, %ymm8
1418 ; AVX2-FCP-NEXT: vpermps %ymm7, %ymm0, %ymm9
1419 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7]
1420 ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm9 = xmm11[2],xmm10[2],xmm11[3],xmm10[3]
1421 ; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload
1422 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1],xmm10[2,3]
1423 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
1424 ; AVX2-FCP-NEXT: vpermps %ymm5, %ymm0, %ymm9
1425 ; AVX2-FCP-NEXT: vpermps %ymm4, %ymm0, %ymm10
1426 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7]
1427 ; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1428 ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm10 = xmm12[2],xmm13[2],xmm12[3],xmm13[3]
1429 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
1430 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7]
1431 ; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm9 = [3,7,3,7,3,7,3,7]
1432 ; AVX2-FCP-NEXT: vpermps %ymm5, %ymm9, %ymm5
1433 ; AVX2-FCP-NEXT: vpermps %ymm4, %ymm9, %ymm4
1434 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
1435 ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm5 = xmm14[2],xmm6[2],xmm14[3],xmm6[3]
1436 ; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm6 # 32-byte Folded Reload
1437 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
1438 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
1439 ; AVX2-FCP-NEXT: vpermps %ymm2, %ymm9, %ymm2
1440 ; AVX2-FCP-NEXT: vpermps %ymm7, %ymm9, %ymm5
1441 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7]
1442 ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
1443 ; AVX2-FCP-NEXT: vpermps %ymm15, %ymm9, %ymm3
1444 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
1445 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
1446 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
1447 ; AVX2-FCP-NEXT: vmovaps %ymm2, 32(%rsi)
1448 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
1449 ; AVX2-FCP-NEXT: vmovaps %ymm2, (%rsi)
1450 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
1451 ; AVX2-FCP-NEXT: vmovaps %ymm2, 32(%rdx)
1452 ; AVX2-FCP-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload
1453 ; AVX2-FCP-NEXT: vmovaps %ymm2, (%rdx)
1454 ; AVX2-FCP-NEXT: vmovaps %ymm8, 32(%rcx)
1455 ; AVX2-FCP-NEXT: vmovaps %ymm0, (%rcx)
1456 ; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%r8)
1457 ; AVX2-FCP-NEXT: vmovaps %ymm4, (%r8)
1458 ; AVX2-FCP-NEXT: addq $104, %rsp
1459 ; AVX2-FCP-NEXT: vzeroupper
1460 ; AVX2-FCP-NEXT: retq
1462 ; AVX512-LABEL: load_i32_stride4_vf16:
1463 ; AVX512: # %bb.0:
1464 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
1465 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1
1466 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm2
1467 ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm3
1468 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28]
1469 ; AVX512-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
1470 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm5
1471 ; AVX512-NEXT: vpermt2d %zmm3, %zmm4, %zmm5
1472 ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm4
1473 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7]
1474 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,5,9,13,17,21,25,29,1,5,9,13,17,21,25,29]
1475 ; AVX512-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
1476 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm6
1477 ; AVX512-NEXT: vpermt2d %zmm3, %zmm5, %zmm6
1478 ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm5
1479 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7]
1480 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,6,10,14,18,22,26,30,2,6,10,14,18,22,26,30]
1481 ; AVX512-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
1482 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm7
1483 ; AVX512-NEXT: vpermt2d %zmm3, %zmm6, %zmm7
1484 ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm6
1485 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7]
1486 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,7,11,15,19,23,27,31,3,7,11,15,19,23,27,31]
1487 ; AVX512-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
1488 ; AVX512-NEXT: vpermt2d %zmm3, %zmm7, %zmm2
1489 ; AVX512-NEXT: vpermt2d %zmm1, %zmm7, %zmm0
1490 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7]
1491 ; AVX512-NEXT: vmovdqa64 %zmm4, (%rsi)
1492 ; AVX512-NEXT: vmovdqa64 %zmm5, (%rdx)
1493 ; AVX512-NEXT: vmovdqa64 %zmm6, (%rcx)
1494 ; AVX512-NEXT: vmovdqa64 %zmm0, (%r8)
1495 ; AVX512-NEXT: vzeroupper
1496 ; AVX512-NEXT: retq
1498 ; AVX512-FCP-LABEL: load_i32_stride4_vf16:
1499 ; AVX512-FCP: # %bb.0:
1500 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
1501 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
1502 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2
1503 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3
1504 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28]
1505 ; AVX512-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
1506 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm5
1507 ; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm4, %zmm5
1508 ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4
1509 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7]
1510 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,5,9,13,17,21,25,29,1,5,9,13,17,21,25,29]
1511 ; AVX512-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
1512 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm6
1513 ; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm5, %zmm6
1514 ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5
1515 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7]
1516 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,6,10,14,18,22,26,30,2,6,10,14,18,22,26,30]
1517 ; AVX512-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
1518 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm7
1519 ; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm6, %zmm7
1520 ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6
1521 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7]
1522 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,7,11,15,19,23,27,31,3,7,11,15,19,23,27,31]
1523 ; AVX512-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
1524 ; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm7, %zmm2
1525 ; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm7, %zmm0
1526 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7]
1527 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%rsi)
1528 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%rdx)
1529 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, (%rcx)
1530 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%r8)
1531 ; AVX512-FCP-NEXT: vzeroupper
1532 ; AVX512-FCP-NEXT: retq
1534 ; AVX512DQ-LABEL: load_i32_stride4_vf16:
1535 ; AVX512DQ: # %bb.0:
1536 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0
1537 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm1
1538 ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm2
1539 ; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm3
1540 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28]
1541 ; AVX512DQ-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
1542 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm5
1543 ; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm4, %zmm5
1544 ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm4
1545 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7]
1546 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,5,9,13,17,21,25,29,1,5,9,13,17,21,25,29]
1547 ; AVX512DQ-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
1548 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm6
1549 ; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm5, %zmm6
1550 ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm5
1551 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7]
1552 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,6,10,14,18,22,26,30,2,6,10,14,18,22,26,30]
1553 ; AVX512DQ-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
1554 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm7
1555 ; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm6, %zmm7
1556 ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm6
1557 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7]
1558 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,7,11,15,19,23,27,31,3,7,11,15,19,23,27,31]
1559 ; AVX512DQ-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
1560 ; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm7, %zmm2
1561 ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm7, %zmm0
1562 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7]
1563 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, (%rsi)
1564 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, (%rdx)
1565 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, (%rcx)
1566 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%r8)
1567 ; AVX512DQ-NEXT: vzeroupper
1568 ; AVX512DQ-NEXT: retq
1570 ; AVX512DQ-FCP-LABEL: load_i32_stride4_vf16:
1571 ; AVX512DQ-FCP: # %bb.0:
1572 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
1573 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
1574 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2
1575 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3
1576 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28]
1577 ; AVX512DQ-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
1578 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm5
1579 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm4, %zmm5
1580 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4
1581 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7]
1582 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,5,9,13,17,21,25,29,1,5,9,13,17,21,25,29]
1583 ; AVX512DQ-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
1584 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm6
1585 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm5, %zmm6
1586 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5
1587 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7]
1588 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,6,10,14,18,22,26,30,2,6,10,14,18,22,26,30]
1589 ; AVX512DQ-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
1590 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm7
1591 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm6, %zmm7
1592 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6
1593 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7]
1594 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,7,11,15,19,23,27,31,3,7,11,15,19,23,27,31]
1595 ; AVX512DQ-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
1596 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm7, %zmm2
1597 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm7, %zmm0
1598 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7]
1599 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, (%rsi)
1600 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, (%rdx)
1601 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, (%rcx)
1602 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%r8)
1603 ; AVX512DQ-FCP-NEXT: vzeroupper
1604 ; AVX512DQ-FCP-NEXT: retq
1606 ; AVX512BW-LABEL: load_i32_stride4_vf16:
1607 ; AVX512BW: # %bb.0:
1608 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
1609 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1
1610 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm2
1611 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm3
1612 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28]
1613 ; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
1614 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5
1615 ; AVX512BW-NEXT: vpermt2d %zmm3, %zmm4, %zmm5
1616 ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm4
1617 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7]
1618 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,5,9,13,17,21,25,29,1,5,9,13,17,21,25,29]
1619 ; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
1620 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm6
1621 ; AVX512BW-NEXT: vpermt2d %zmm3, %zmm5, %zmm6
1622 ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm5
1623 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7]
1624 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,6,10,14,18,22,26,30,2,6,10,14,18,22,26,30]
1625 ; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
1626 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm7
1627 ; AVX512BW-NEXT: vpermt2d %zmm3, %zmm6, %zmm7
1628 ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm6
1629 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7]
1630 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,7,11,15,19,23,27,31,3,7,11,15,19,23,27,31]
1631 ; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
1632 ; AVX512BW-NEXT: vpermt2d %zmm3, %zmm7, %zmm2
1633 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm7, %zmm0
1634 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7]
1635 ; AVX512BW-NEXT: vmovdqa64 %zmm4, (%rsi)
1636 ; AVX512BW-NEXT: vmovdqa64 %zmm5, (%rdx)
1637 ; AVX512BW-NEXT: vmovdqa64 %zmm6, (%rcx)
1638 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%r8)
1639 ; AVX512BW-NEXT: vzeroupper
1640 ; AVX512BW-NEXT: retq
1642 ; AVX512BW-FCP-LABEL: load_i32_stride4_vf16:
1643 ; AVX512BW-FCP: # %bb.0:
1644 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
1645 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
1646 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2
1647 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3
1648 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28]
1649 ; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
1650 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5
1651 ; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm4, %zmm5
1652 ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4
1653 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7]
1654 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,5,9,13,17,21,25,29,1,5,9,13,17,21,25,29]
1655 ; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
1656 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm6
1657 ; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm5, %zmm6
1658 ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5
1659 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7]
1660 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,6,10,14,18,22,26,30,2,6,10,14,18,22,26,30]
1661 ; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
1662 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm7
1663 ; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm6, %zmm7
1664 ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6
1665 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7]
1666 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,7,11,15,19,23,27,31,3,7,11,15,19,23,27,31]
1667 ; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
1668 ; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm7, %zmm2
1669 ; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm7, %zmm0
1670 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7]
1671 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, (%rsi)
1672 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, (%rdx)
1673 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, (%rcx)
1674 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%r8)
1675 ; AVX512BW-FCP-NEXT: vzeroupper
1676 ; AVX512BW-FCP-NEXT: retq
1678 ; AVX512DQ-BW-LABEL: load_i32_stride4_vf16:
1679 ; AVX512DQ-BW: # %bb.0:
1680 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0
1681 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1
1682 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm2
1683 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm3
1684 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28]
1685 ; AVX512DQ-BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
1686 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm5
1687 ; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm4, %zmm5
1688 ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm4
1689 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7]
1690 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,5,9,13,17,21,25,29,1,5,9,13,17,21,25,29]
1691 ; AVX512DQ-BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
1692 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm6
1693 ; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm5, %zmm6
1694 ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm5
1695 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7]
1696 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,6,10,14,18,22,26,30,2,6,10,14,18,22,26,30]
1697 ; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
1698 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm7
1699 ; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm6, %zmm7
1700 ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm6
1701 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7]
1702 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,7,11,15,19,23,27,31,3,7,11,15,19,23,27,31]
1703 ; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
1704 ; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm7, %zmm2
1705 ; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm7, %zmm0
1706 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7]
1707 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, (%rsi)
1708 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, (%rdx)
1709 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, (%rcx)
1710 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%r8)
1711 ; AVX512DQ-BW-NEXT: vzeroupper
1712 ; AVX512DQ-BW-NEXT: retq
1714 ; AVX512DQ-BW-FCP-LABEL: load_i32_stride4_vf16:
1715 ; AVX512DQ-BW-FCP: # %bb.0:
1716 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
1717 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
1718 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2
1719 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3
1720 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28]
1721 ; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
1722 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5
1723 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm4, %zmm5
1724 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4
1725 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7]
1726 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,5,9,13,17,21,25,29,1,5,9,13,17,21,25,29]
1727 ; AVX512DQ-BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
1728 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm6
1729 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm5, %zmm6
1730 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5
1731 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7]
1732 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,6,10,14,18,22,26,30,2,6,10,14,18,22,26,30]
1733 ; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
1734 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm7
1735 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm6, %zmm7
1736 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6
1737 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7]
1738 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,7,11,15,19,23,27,31,3,7,11,15,19,23,27,31]
1739 ; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
1740 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm7, %zmm2
1741 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm7, %zmm0
1742 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7]
1743 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, (%rsi)
1744 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, (%rdx)
1745 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, (%rcx)
1746 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%r8)
1747 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
1748 ; AVX512DQ-BW-FCP-NEXT: retq
1749 %wide.vec = load <64 x i32>, ptr %in.vec, align 64
1750 %strided.vec0 = shufflevector <64 x i32> %wide.vec, <64 x i32> poison, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
1751 %strided.vec1 = shufflevector <64 x i32> %wide.vec, <64 x i32> poison, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
1752 %strided.vec2 = shufflevector <64 x i32> %wide.vec, <64 x i32> poison, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62>
1753 %strided.vec3 = shufflevector <64 x i32> %wide.vec, <64 x i32> poison, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
1754 store <16 x i32> %strided.vec0, ptr %out.vec0, align 64
1755 store <16 x i32> %strided.vec1, ptr %out.vec1, align 64
1756 store <16 x i32> %strided.vec2, ptr %out.vec2, align 64
1757 store <16 x i32> %strided.vec3, ptr %out.vec3, align 64
1758 ret void
1759 }
1761 define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3) nounwind {
1762 ; SSE-LABEL: load_i32_stride4_vf32:
1763 ; SSE: # %bb.0:
1764 ; SSE-NEXT: subq $456, %rsp # imm = 0x1C8
1765 ; SSE-NEXT: movaps 272(%rdi), %xmm7
1766 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1767 ; SSE-NEXT: movaps 304(%rdi), %xmm8
1768 ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1769 ; SSE-NEXT: movaps 288(%rdi), %xmm2
1770 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1771 ; SSE-NEXT: movaps 336(%rdi), %xmm10
1772 ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1773 ; SSE-NEXT: movaps 320(%rdi), %xmm6
1774 ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1775 ; SSE-NEXT: movaps 368(%rdi), %xmm11
1776 ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1777 ; SSE-NEXT: movaps 352(%rdi), %xmm5
1778 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1779 ; SSE-NEXT: movaps 80(%rdi), %xmm9
1780 ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1781 ; SSE-NEXT: movaps 64(%rdi), %xmm4
1782 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1783 ; SSE-NEXT: movaps 112(%rdi), %xmm3
1784 ; SSE-NEXT: movaps 96(%rdi), %xmm0
1785 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1786 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
1787 ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1]
1788 ; SSE-NEXT: movaps %xmm4, %xmm1
1789 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
1790 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1791 ; SSE-NEXT: movaps %xmm5, %xmm1
1792 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1]
1793 ; SSE-NEXT: movaps %xmm6, %xmm5
1794 ; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1]
1795 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1]
1796 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1797 ; SSE-NEXT: movaps %xmm5, %xmm0
1798 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1799 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1800 ; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1]
1801 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1802 ; SSE-NEXT: movaps %xmm2, %xmm0
1803 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1]
1804 ; SSE-NEXT: movaps 256(%rdi), %xmm1
1805 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1806 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1]
1807 ; SSE-NEXT: movaps %xmm1, %xmm2
1808 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
1809 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1810 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1811 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1812 ; SSE-NEXT: movaps 240(%rdi), %xmm1
1813 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1814 ; SSE-NEXT: movaps 224(%rdi), %xmm13
1815 ; SSE-NEXT: movaps %xmm13, %xmm0
1816 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1817 ; SSE-NEXT: movaps 208(%rdi), %xmm2
1818 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1819 ; SSE-NEXT: movaps 192(%rdi), %xmm1
1820 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1821 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
1822 ; SSE-NEXT: movaps %xmm1, %xmm2
1823 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
1824 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1825 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1826 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1827 ; SSE-NEXT: movaps 496(%rdi), %xmm1
1828 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1829 ; SSE-NEXT: movaps 480(%rdi), %xmm0
1830 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1831 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1832 ; SSE-NEXT: movaps 464(%rdi), %xmm2
1833 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1834 ; SSE-NEXT: movaps 448(%rdi), %xmm1
1835 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1836 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
1837 ; SSE-NEXT: movaps %xmm1, %xmm2
1838 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
1839 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1840 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1841 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1842 ; SSE-NEXT: movaps 176(%rdi), %xmm1
1843 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1844 ; SSE-NEXT: movaps 160(%rdi), %xmm4
1845 ; SSE-NEXT: movaps %xmm4, %xmm0
1846 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1847 ; SSE-NEXT: movaps 144(%rdi), %xmm1
1848 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1849 ; SSE-NEXT: movaps 128(%rdi), %xmm15
1850 ; SSE-NEXT: movaps %xmm15, %xmm14
1851 ; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1]
1852 ; SSE-NEXT: movaps %xmm14, %xmm1
1853 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
1854 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1855 ; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1]
1856 ; SSE-NEXT: movaps 432(%rdi), %xmm1
1857 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1858 ; SSE-NEXT: movaps 416(%rdi), %xmm10
1859 ; SSE-NEXT: movaps %xmm10, %xmm0
1860 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1861 ; SSE-NEXT: movaps 400(%rdi), %xmm1
1862 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1863 ; SSE-NEXT: movaps 384(%rdi), %xmm12
1864 ; SSE-NEXT: movaps %xmm12, %xmm7
1865 ; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1]
1866 ; SSE-NEXT: movaps %xmm7, %xmm1
1867 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
1868 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1869 ; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1]
1870 ; SSE-NEXT: movaps 32(%rdi), %xmm11
1871 ; SSE-NEXT: movaps 48(%rdi), %xmm0
1872 ; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
1873 ; SSE-NEXT: movaps %xmm11, %xmm8
1874 ; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1]
1875 ; SSE-NEXT: movaps (%rdi), %xmm6
1876 ; SSE-NEXT: movaps 16(%rdi), %xmm9
1877 ; SSE-NEXT: movaps %xmm6, %xmm5
1878 ; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1]
1879 ; SSE-NEXT: movaps %xmm5, %xmm0
1880 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm8[0]
1881 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1882 ; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm8[1]
1883 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1884 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
1885 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
1886 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
1887 ; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3]
1888 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
1889 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
1890 ; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3]
1891 ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1892 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
1893 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
1894 ; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3]
1895 ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1896 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1897 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
1898 ; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
1899 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1900 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1901 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
1902 ; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3]
1903 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
1904 ; SSE-NEXT: # xmm13 = xmm13[2],mem[2],xmm13[3],mem[3]
1905 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1906 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
1907 ; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
1908 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
1909 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
1910 ; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3]
1911 ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1912 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
1913 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
1914 ; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3]
1915 ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1916 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
1917 ; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3]
1918 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
1919 ; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3]
1920 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
1921 ; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3]
1922 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
1923 ; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3]
1924 ; SSE-NEXT: unpckhps (%rsp), %xmm11 # 16-byte Folded Reload
1925 ; SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3]
1926 ; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm9[2],xmm6[3],xmm9[3]
1927 ; SSE-NEXT: movaps %xmm3, %xmm8
1928 ; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0]
1929 ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1]
1930 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1931 ; SSE-NEXT: movaps %xmm15, %xmm9
1932 ; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm4[0]
1933 ; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm4[1]
1934 ; SSE-NEXT: movaps %xmm1, %xmm4
1935 ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm13[0]
1936 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm13[1]
1937 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1938 ; SSE-NEXT: movaps %xmm2, %xmm0
1939 ; SSE-NEXT: movaps %xmm2, %xmm13
1940 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1941 ; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm2[0]
1942 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
1943 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1944 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1945 ; SSE-NEXT: movaps %xmm1, %xmm0
1946 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
1947 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0]
1948 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
1949 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1950 ; SSE-NEXT: movaps %xmm12, %xmm3
1951 ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm10[0]
1952 ; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm10[1]
1953 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1954 ; SSE-NEXT: movaps %xmm1, %xmm10
1955 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1956 ; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm2[0]
1957 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
1958 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1959 ; SSE-NEXT: movaps %xmm6, %xmm2
1960 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm11[0]
1961 ; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm11[1]
1962 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1963 ; SSE-NEXT: movaps %xmm1, 96(%rsi)
1964 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1965 ; SSE-NEXT: movaps %xmm1, 32(%rsi)
1966 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1967 ; SSE-NEXT: movaps %xmm1, 112(%rsi)
1968 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
1969 ; SSE-NEXT: movaps %xmm11, 48(%rsi)
1970 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
1971 ; SSE-NEXT: movaps %xmm11, 64(%rsi)
1972 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1973 ; SSE-NEXT: movaps %xmm1, (%rsi)
1974 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
1975 ; SSE-NEXT: movaps %xmm11, 80(%rsi)
1976 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
1977 ; SSE-NEXT: movaps %xmm11, 16(%rsi)
1978 ; SSE-NEXT: movaps %xmm7, 96(%rdx)
1979 ; SSE-NEXT: movaps %xmm14, 32(%rdx)
1980 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
1981 ; SSE-NEXT: movaps %xmm7, 112(%rdx)
1982 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
1983 ; SSE-NEXT: movaps %xmm7, 48(%rdx)
1984 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
1985 ; SSE-NEXT: movaps %xmm7, 64(%rdx)
1986 ; SSE-NEXT: movaps %xmm5, (%rdx)
1987 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1988 ; SSE-NEXT: movaps %xmm1, 80(%rdx)
1989 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1990 ; SSE-NEXT: movaps %xmm1, 16(%rdx)
1991 ; SSE-NEXT: movaps %xmm3, 96(%rcx)
1992 ; SSE-NEXT: movaps %xmm9, 32(%rcx)
1993 ; SSE-NEXT: movaps %xmm10, 112(%rcx)
1994 ; SSE-NEXT: movaps %xmm4, 48(%rcx)
1995 ; SSE-NEXT: movaps %xmm13, 64(%rcx)
1996 ; SSE-NEXT: movaps %xmm2, (%rcx)
1997 ; SSE-NEXT: movaps %xmm0, 80(%rcx)
1998 ; SSE-NEXT: movaps %xmm8, 16(%rcx)
1999 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2000 ; SSE-NEXT: movaps %xmm0, 112(%r8)
2001 ; SSE-NEXT: movaps %xmm12, 96(%r8)
2002 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2003 ; SSE-NEXT: movaps %xmm0, 80(%r8)
2004 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2005 ; SSE-NEXT: movaps %xmm0, 64(%r8)
2006 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2007 ; SSE-NEXT: movaps %xmm0, 48(%r8)
2008 ; SSE-NEXT: movaps %xmm15, 32(%r8)
2009 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2010 ; SSE-NEXT: movaps %xmm0, 16(%r8)
2011 ; SSE-NEXT: movaps %xmm6, (%r8)
2012 ; SSE-NEXT: addq $456, %rsp # imm = 0x1C8
2013 ; SSE-NEXT: retq
2015 ; AVX-LABEL: load_i32_stride4_vf32:
2016 ; AVX: # %bb.0:
2017 ; AVX-NEXT: subq $1000, %rsp # imm = 0x3E8
2018 ; AVX-NEXT: vmovaps 448(%rdi), %ymm3
2019 ; AVX-NEXT: vmovaps 480(%rdi), %ymm4
2020 ; AVX-NEXT: vmovaps 320(%rdi), %ymm2
2021 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2022 ; AVX-NEXT: vmovaps 352(%rdi), %ymm10
2023 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm10[2,3,0,1]
2024 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm15[0],ymm10[2],ymm15[2]
2025 ; AVX-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2026 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1]
2027 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2028 ; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5]
2029 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4]
2030 ; AVX-NEXT: vmovaps 288(%rdi), %xmm2
2031 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2032 ; AVX-NEXT: vmovaps 304(%rdi), %xmm1
2033 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2034 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
2035 ; AVX-NEXT: vmovaps 272(%rdi), %xmm9
2036 ; AVX-NEXT: vmovaps 256(%rdi), %xmm2
2037 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2038 ; AVX-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1]
2039 ; AVX-NEXT: vmovaps %xmm9, (%rsp) # 16-byte Spill
2040 ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,0]
2041 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
2042 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2043 ; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2044 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm4[2,3,0,1]
2045 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2046 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],ymm0[0],ymm4[2],ymm0[2]
2047 ; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2048 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm3[2,3,0,1]
2049 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2050 ; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5]
2051 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4]
2052 ; AVX-NEXT: vmovaps 416(%rdi), %xmm2
2053 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2054 ; AVX-NEXT: vmovaps 432(%rdi), %xmm1
2055 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2056 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
2057 ; AVX-NEXT: vmovaps 400(%rdi), %xmm3
2058 ; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2059 ; AVX-NEXT: vmovaps 384(%rdi), %xmm2
2060 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2061 ; AVX-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
2062 ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,0]
2063 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
2064 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2065 ; AVX-NEXT: vmovaps 192(%rdi), %ymm11
2066 ; AVX-NEXT: vmovaps 224(%rdi), %ymm13
2067 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm13[2,3,0,1]
2068 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm8[0],ymm13[2],ymm8[2]
2069 ; AVX-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2070 ; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2071 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm11[2,3,0,1]
2072 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2073 ; AVX-NEXT: vunpcklps {{.*#+}} ymm2 = ymm1[0],ymm11[0],ymm1[1],ymm11[1],ymm1[4],ymm11[4],ymm1[5],ymm11[5]
2074 ; AVX-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2075 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,0],ymm2[4,5],ymm0[6,4]
2076 ; AVX-NEXT: vmovaps 160(%rdi), %xmm1
2077 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2078 ; AVX-NEXT: vmovaps 176(%rdi), %xmm2
2079 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2080 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
2081 ; AVX-NEXT: vmovaps 144(%rdi), %xmm4
2082 ; AVX-NEXT: vmovaps 128(%rdi), %xmm12
2083 ; AVX-NEXT: vunpcklps {{.*#+}} xmm5 = xmm12[0],xmm4[0],xmm12[1],xmm4[1]
2084 ; AVX-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2085 ; AVX-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2086 ; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,0]
2087 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
2088 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2089 ; AVX-NEXT: vmovaps 64(%rdi), %ymm0
2090 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2091 ; AVX-NEXT: vmovaps 96(%rdi), %ymm2
2092 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2093 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1]
2094 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2095 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
2096 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
2097 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2098 ; AVX-NEXT: vunpcklps {{.*#+}} ymm7 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
2099 ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,1],ymm5[2,0],ymm7[4,5],ymm5[6,4]
2100 ; AVX-NEXT: vmovaps (%rdi), %xmm2
2101 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2102 ; AVX-NEXT: vmovaps 16(%rdi), %xmm7
2103 ; AVX-NEXT: vmovaps 32(%rdi), %xmm0
2104 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2105 ; AVX-NEXT: vmovaps 48(%rdi), %xmm5
2106 ; AVX-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2107 ; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm5[0],xmm0[0]
2108 ; AVX-NEXT: vunpcklps {{.*#+}} xmm14 = xmm2[0],xmm7[0],xmm2[1],xmm7[1]
2109 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm14[0,1],xmm0[2,0]
2110 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
2111 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2112 ; AVX-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2113 ; AVX-NEXT: vunpcklps {{.*#+}} ymm0 = ymm15[0],ymm10[0],ymm15[1],ymm10[1],ymm15[4],ymm10[4],ymm15[5],ymm10[5]
2114 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
2115 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
2116 ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm6[1,0],ymm10[1,0],ymm6[5,4],ymm10[5,4]
2117 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7]
2118 ; AVX-NEXT: vinsertps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm1 # 16-byte Folded Reload
2119 ; AVX-NEXT: # xmm1 = mem[0],xmm9[1],zero,zero
2120 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
2121 ; AVX-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm14 # 16-byte Folded Reload
2122 ; AVX-NEXT: # xmm14 = xmm9[0],mem[0],xmm9[1],mem[1]
2123 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3]
2124 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
2125 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2126 ; AVX-NEXT: vunpcklps {{.*#+}} ymm0 = ymm8[0],ymm13[0],ymm8[1],ymm13[1],ymm8[4],ymm13[4],ymm8[5],ymm13[5]
2127 ; AVX-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm1 # 32-byte Folded Reload
2128 ; AVX-NEXT: # ymm1 = ymm11[1,0],mem[1,0],ymm11[5,4],mem[5,4]
2129 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7]
2130 ; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm12[1],xmm4[1],zero,zero
2131 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2132 ; AVX-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm14 # 16-byte Folded Reload
2133 ; AVX-NEXT: # xmm14 = xmm2[0],mem[0],xmm2[1],mem[1]
2134 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3]
2135 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
2136 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2137 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2138 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2139 ; AVX-NEXT: vunpcklps {{.*#+}} ymm14 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[4],ymm2[4],ymm0[5],ymm2[5]
2140 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
2141 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2142 ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0],ymm3[1,0],ymm1[5,4],ymm3[5,4]
2143 ; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm1[2,0],ymm14[2,3],ymm1[6,4],ymm14[6,7]
2144 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2145 ; AVX-NEXT: vinsertps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm1 # 16-byte Folded Reload
2146 ; AVX-NEXT: # xmm1 = mem[0],xmm4[1],zero,zero
2147 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
2148 ; AVX-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm14 # 16-byte Folded Reload
2149 ; AVX-NEXT: # xmm14 = xmm5[0],mem[0],xmm5[1],mem[1]
2150 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3]
2151 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7]
2152 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2153 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2154 ; AVX-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload
2155 ; AVX-NEXT: # ymm8 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5]
2156 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
2157 ; AVX-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm1 # 32-byte Folded Reload
2158 ; AVX-NEXT: # ymm1 = ymm11[1,0],mem[1,0],ymm11[5,4],mem[5,4]
2159 ; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm1[2,0],ymm8[2,3],ymm1[6,4],ymm8[6,7]
2160 ; AVX-NEXT: vmovaps %xmm7, %xmm12
2161 ; AVX-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2162 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
2163 ; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm7[1],xmm12[1],zero,zero
2164 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
2165 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
2166 ; AVX-NEXT: vunpcklps {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1]
2167 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3]
2168 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7]
2169 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2170 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2171 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm1[1],ymm15[1],ymm1[3],ymm15[3]
2172 ; AVX-NEXT: vunpckhps {{.*#+}} ymm1 = ymm10[2],ymm6[2],ymm10[3],ymm6[3],ymm10[6],ymm6[6],ymm10[7],ymm6[7]
2173 ; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm1[0,1],ymm8[2,0],ymm1[4,5],ymm8[6,4]
2174 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2175 ; AVX-NEXT: vunpckhps (%rsp), %xmm1, %xmm1 # 16-byte Folded Reload
2176 ; AVX-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
2177 ; AVX-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm14 # 16-byte Folded Reload
2178 ; AVX-NEXT: # xmm14 = zero,zero,xmm9[2],mem[0]
2179 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3]
2180 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7]
2181 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2182 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm2[1],ymm0[1],ymm2[3],ymm0[3]
2183 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm1 # 32-byte Folded Reload
2184 ; AVX-NEXT: # ymm1 = ymm3[2],mem[2],ymm3[3],mem[3],ymm3[6],mem[6],ymm3[7],mem[7]
2185 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm8[2,0],ymm1[4,5],ymm8[6,4]
2186 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2187 ; AVX-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
2188 ; AVX-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm14 # 16-byte Folded Reload
2189 ; AVX-NEXT: # xmm14 = zero,zero,xmm5[2],mem[0]
2190 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3]
2191 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
2192 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2193 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
2194 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
2195 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm6[1],ymm5[3],ymm6[3]
2196 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
2197 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
2198 ; AVX-NEXT: vunpckhps {{.*#+}} ymm1 = ymm8[2],ymm3[2],ymm8[3],ymm3[3],ymm8[6],ymm3[6],ymm8[7],ymm3[7]
2199 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4]
2200 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
2201 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
2202 ; AVX-NEXT: vunpckhps {{.*#+}} xmm1 = xmm9[2],xmm10[2],xmm9[3],xmm10[3]
2203 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2204 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2205 ; AVX-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm2[2],xmm4[2]
2206 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3]
2207 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
2208 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2209 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
2210 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2211 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm15[1],ymm0[3],ymm15[3]
2212 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2213 ; AVX-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm11[2],ymm1[3],ymm11[3],ymm1[6],ymm11[6],ymm1[7],ymm11[7]
2214 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4]
2215 ; AVX-NEXT: vunpckhps {{.*#+}} xmm1 = xmm7[2],xmm12[2],xmm7[3],xmm12[3]
2216 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
2217 ; AVX-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm7[2],xmm13[2]
2218 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3]
2219 ; AVX-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm0[4,5,6,7]
2220 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2221 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
2222 ; AVX-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
2223 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2224 ; AVX-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
2225 ; AVX-NEXT: # ymm1 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4]
2226 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7]
2227 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2228 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
2229 ; AVX-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
2230 ; AVX-NEXT: vmovaps (%rsp), %xmm12 # 16-byte Reload
2231 ; AVX-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload
2232 ; AVX-NEXT: # xmm12 = xmm12[3,0],mem[3,0]
2233 ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm12[2,0],xmm1[2,3]
2234 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
2235 ; AVX-NEXT: vunpckhps {{.*#+}} ymm1 = ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[6],ymm5[6],ymm6[7],ymm5[7]
2236 ; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,0],ymm8[3,0],ymm3[7,4],ymm8[7,4]
2237 ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm3[2,0],ymm1[2,3],ymm3[6,4],ymm1[6,7]
2238 ; AVX-NEXT: vunpckhps {{.*#+}} xmm3 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
2239 ; AVX-NEXT: vshufps {{.*#+}} xmm6 = xmm10[3,0],xmm9[3,0]
2240 ; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm6[2,0],xmm3[2,3]
2241 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
2242 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2243 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload
2244 ; AVX-NEXT: # ymm3 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7]
2245 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2246 ; AVX-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload
2247 ; AVX-NEXT: # ymm6 = ymm2[3,0],mem[3,0],ymm2[7,4],mem[7,4]
2248 ; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm6[2,0],ymm3[2,3],ymm6[6,4],ymm3[6,7]
2249 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2250 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm6 # 16-byte Folded Reload
2251 ; AVX-NEXT: # xmm6 = xmm2[2],mem[2],xmm2[3],mem[3]
2252 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2253 ; AVX-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm8 # 16-byte Folded Reload
2254 ; AVX-NEXT: # xmm8 = xmm2[3,0],mem[3,0]
2255 ; AVX-NEXT: vshufps {{.*#+}} xmm6 = xmm8[2,0],xmm6[2,3]
2256 ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7]
2257 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload
2258 ; AVX-NEXT: # ymm2 = ymm15[2],mem[2],ymm15[3],mem[3],ymm15[6],mem[6],ymm15[7],mem[7]
2259 ; AVX-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm4 # 32-byte Folded Reload
2260 ; AVX-NEXT: # ymm4 = ymm11[3,0],mem[3,0],ymm11[7,4],mem[7,4]
2261 ; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm4[2,0],ymm2[2,3],ymm4[6,4],ymm2[6,7]
2262 ; AVX-NEXT: vunpckhps {{.*#+}} xmm4 = xmm7[2],xmm13[2],xmm7[3],xmm13[3]
2263 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
2264 ; AVX-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
2265 ; AVX-NEXT: # xmm5 = xmm5[3,0],mem[3,0]
2266 ; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm5[2,0],xmm4[2,3]
2267 ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
2268 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2269 ; AVX-NEXT: vmovaps %ymm4, 32(%rsi)
2270 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2271 ; AVX-NEXT: vmovaps %ymm4, 96(%rsi)
2272 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2273 ; AVX-NEXT: vmovaps %ymm4, 64(%rsi)
2274 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2275 ; AVX-NEXT: vmovaps %ymm4, (%rsi)
2276 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2277 ; AVX-NEXT: vmovaps %ymm4, 96(%rdx)
2278 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2279 ; AVX-NEXT: vmovaps %ymm4, 32(%rdx)
2280 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2281 ; AVX-NEXT: vmovaps %ymm4, (%rdx)
2282 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2283 ; AVX-NEXT: vmovaps %ymm4, 64(%rdx)
2284 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2285 ; AVX-NEXT: vmovaps %ymm4, 32(%rcx)
2286 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2287 ; AVX-NEXT: vmovaps %ymm4, 96(%rcx)
2288 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2289 ; AVX-NEXT: vmovaps %ymm4, 64(%rcx)
2290 ; AVX-NEXT: vmovaps %ymm14, (%rcx)
2291 ; AVX-NEXT: vmovaps %ymm3, 96(%r8)
2292 ; AVX-NEXT: vmovaps %ymm1, 32(%r8)
2293 ; AVX-NEXT: vmovaps %ymm0, 64(%r8)
2294 ; AVX-NEXT: vmovaps %ymm2, (%r8)
2295 ; AVX-NEXT: addq $1000, %rsp # imm = 0x3E8
2296 ; AVX-NEXT: vzeroupper
2297 ; AVX-NEXT: retq
2299 ; AVX2-LABEL: load_i32_stride4_vf32:
2300 ; AVX2: # %bb.0:
2301 ; AVX2-NEXT: subq $680, %rsp # imm = 0x2A8
2302 ; AVX2-NEXT: vmovaps 192(%rdi), %ymm12
2303 ; AVX2-NEXT: vmovaps 416(%rdi), %ymm8
2304 ; AVX2-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2305 ; AVX2-NEXT: vmovaps 448(%rdi), %ymm4
2306 ; AVX2-NEXT: vmovaps 480(%rdi), %ymm10
2307 ; AVX2-NEXT: vmovaps 288(%rdi), %ymm11
2308 ; AVX2-NEXT: vmovaps 320(%rdi), %ymm6
2309 ; AVX2-NEXT: vmovaps 352(%rdi), %ymm9
2310 ; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm0 = [0,4,0,4,0,4,0,4]
2311 ; AVX2-NEXT: vpermps %ymm9, %ymm0, %ymm1
2312 ; AVX2-NEXT: vpermps %ymm6, %ymm0, %ymm2
2313 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
2314 ; AVX2-NEXT: vmovaps 272(%rdi), %xmm3
2315 ; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2316 ; AVX2-NEXT: vmovaps 256(%rdi), %xmm2
2317 ; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2318 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
2319 ; AVX2-NEXT: vpermps %ymm11, %ymm0, %ymm3
2320 ; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3]
2321 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
2322 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2323 ; AVX2-NEXT: vpermps %ymm10, %ymm0, %ymm1
2324 ; AVX2-NEXT: vpermps %ymm4, %ymm0, %ymm2
2325 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
2326 ; AVX2-NEXT: vmovaps 400(%rdi), %xmm3
2327 ; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2328 ; AVX2-NEXT: vmovaps 384(%rdi), %xmm2
2329 ; AVX2-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill
2330 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
2331 ; AVX2-NEXT: vpermps %ymm8, %ymm0, %ymm3
2332 ; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3]
2333 ; AVX2-NEXT: vmovaps 224(%rdi), %ymm8
2334 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
2335 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2336 ; AVX2-NEXT: vpermps %ymm8, %ymm0, %ymm1
2337 ; AVX2-NEXT: vpermps %ymm12, %ymm0, %ymm2
2338 ; AVX2-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2339 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
2340 ; AVX2-NEXT: vmovaps 160(%rdi), %ymm5
2341 ; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2342 ; AVX2-NEXT: vmovaps 144(%rdi), %xmm3
2343 ; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2344 ; AVX2-NEXT: vmovaps 128(%rdi), %xmm2
2345 ; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2346 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
2347 ; AVX2-NEXT: vpermps %ymm5, %ymm0, %ymm3
2348 ; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3]
2349 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
2350 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2351 ; AVX2-NEXT: vmovaps 64(%rdi), %ymm14
2352 ; AVX2-NEXT: vmovaps 96(%rdi), %ymm15
2353 ; AVX2-NEXT: vpermps %ymm15, %ymm0, %ymm1
2354 ; AVX2-NEXT: vpermps %ymm14, %ymm0, %ymm2
2355 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
2356 ; AVX2-NEXT: vmovaps 32(%rdi), %ymm2
2357 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2358 ; AVX2-NEXT: vpermps %ymm2, %ymm0, %ymm0
2359 ; AVX2-NEXT: vmovaps (%rdi), %xmm3
2360 ; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2361 ; AVX2-NEXT: vmovaps 16(%rdi), %xmm2
2362 ; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2363 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
2364 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
2365 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
2366 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2367 ; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm0 = [1,5,1,5,1,5,1,5]
2368 ; AVX2-NEXT: vpermps %ymm9, %ymm0, %ymm1
2369 ; AVX2-NEXT: vpermps %ymm6, %ymm0, %ymm2
2370 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
2371 ; AVX2-NEXT: vmovaps 256(%rdi), %ymm5
2372 ; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2373 ; AVX2-NEXT: vmovaps 304(%rdi), %xmm3
2374 ; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2375 ; AVX2-NEXT: vmovaps 288(%rdi), %xmm2
2376 ; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2377 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
2378 ; AVX2-NEXT: vpermps %ymm5, %ymm0, %ymm3
2379 ; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
2380 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
2381 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2382 ; AVX2-NEXT: vpermps %ymm15, %ymm0, %ymm1
2383 ; AVX2-NEXT: vpermps %ymm14, %ymm0, %ymm2
2384 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
2385 ; AVX2-NEXT: vmovaps (%rdi), %ymm7
2386 ; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2387 ; AVX2-NEXT: vmovaps 32(%rdi), %xmm3
2388 ; AVX2-NEXT: vmovaps 48(%rdi), %xmm2
2389 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm5 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
2390 ; AVX2-NEXT: vpermps %ymm7, %ymm0, %ymm7
2391 ; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3]
2392 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7]
2393 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2394 ; AVX2-NEXT: vpermps %ymm8, %ymm0, %ymm1
2395 ; AVX2-NEXT: vpermps %ymm12, %ymm0, %ymm5
2396 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7]
2397 ; AVX2-NEXT: vmovaps 128(%rdi), %ymm12
2398 ; AVX2-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2399 ; AVX2-NEXT: vmovaps 176(%rdi), %xmm7
2400 ; AVX2-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2401 ; AVX2-NEXT: vmovaps 160(%rdi), %xmm5
2402 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm7 = xmm5[0],xmm7[0],xmm5[1],xmm7[1]
2403 ; AVX2-NEXT: vpermps %ymm12, %ymm0, %ymm13
2404 ; AVX2-NEXT: vblendps {{.*#+}} xmm7 = xmm13[0,1],xmm7[2,3]
2405 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7]
2406 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2407 ; AVX2-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2408 ; AVX2-NEXT: vpermps %ymm10, %ymm0, %ymm1
2409 ; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2410 ; AVX2-NEXT: vpermps %ymm4, %ymm0, %ymm7
2411 ; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm7[0,1,2,3,4,5],ymm1[6,7]
2412 ; AVX2-NEXT: vmovaps 384(%rdi), %ymm1
2413 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2414 ; AVX2-NEXT: vpermps %ymm1, %ymm0, %ymm0
2415 ; AVX2-NEXT: vmovaps 432(%rdi), %xmm7
2416 ; AVX2-NEXT: vmovaps 416(%rdi), %xmm1
2417 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm12 = xmm1[0],xmm7[0],xmm1[1],xmm7[1]
2418 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm12[2,3]
2419 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7]
2420 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2421 ; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm0 = [2,6,2,6,2,6,2,6]
2422 ; AVX2-NEXT: vpermps %ymm9, %ymm0, %ymm12
2423 ; AVX2-NEXT: vpermps %ymm6, %ymm0, %ymm13
2424 ; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7]
2425 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
2426 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload
2427 ; AVX2-NEXT: # xmm13 = xmm13[2],mem[2],xmm13[3],mem[3]
2428 ; AVX2-NEXT: vpermps %ymm11, %ymm0, %ymm11
2429 ; AVX2-NEXT: vblendps {{.*#+}} xmm11 = xmm13[0,1],xmm11[2,3]
2430 ; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7]
2431 ; AVX2-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2432 ; AVX2-NEXT: vpermps %ymm10, %ymm0, %ymm11
2433 ; AVX2-NEXT: vpermps %ymm4, %ymm0, %ymm12
2434 ; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7]
2435 ; AVX2-NEXT: vmovaps (%rsp), %xmm4 # 16-byte Reload
2436 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm12 # 16-byte Folded Reload
2437 ; AVX2-NEXT: # xmm12 = xmm4[2],mem[2],xmm4[3],mem[3]
2438 ; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
2439 ; AVX2-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3]
2440 ; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm11[4,5,6,7]
2441 ; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2442 ; AVX2-NEXT: vpermps %ymm8, %ymm0, %ymm11
2443 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2444 ; AVX2-NEXT: vpermps %ymm4, %ymm0, %ymm12
2445 ; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7]
2446 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
2447 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm12 # 16-byte Folded Reload
2448 ; AVX2-NEXT: # xmm12 = xmm10[2],mem[2],xmm10[3],mem[3]
2449 ; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
2450 ; AVX2-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3]
2451 ; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm11[4,5,6,7]
2452 ; AVX2-NEXT: vmovups %ymm10, (%rsp) # 32-byte Spill
2453 ; AVX2-NEXT: vpermps %ymm15, %ymm0, %ymm11
2454 ; AVX2-NEXT: vpermps %ymm14, %ymm0, %ymm12
2455 ; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7]
2456 ; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
2457 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
2458 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm12 # 16-byte Folded Reload
2459 ; AVX2-NEXT: # xmm12 = xmm10[2],mem[2],xmm10[3],mem[3]
2460 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm12[0,1],xmm0[2,3]
2461 ; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm11[4,5,6,7]
2462 ; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm0 = [3,7,3,7,3,7,3,7]
2463 ; AVX2-NEXT: vpermps %ymm15, %ymm0, %ymm11
2464 ; AVX2-NEXT: vpermps %ymm14, %ymm0, %ymm12
2465 ; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7]
2466 ; AVX2-NEXT: vunpckhps {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
2467 ; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
2468 ; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
2469 ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7]
2470 ; AVX2-NEXT: vpermps %ymm9, %ymm0, %ymm3
2471 ; AVX2-NEXT: vpermps %ymm6, %ymm0, %ymm6
2472 ; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7]
2473 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
2474 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload
2475 ; AVX2-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3]
2476 ; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
2477 ; AVX2-NEXT: vblendps {{.*#+}} xmm6 = xmm9[0,1],xmm6[2,3]
2478 ; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7]
2479 ; AVX2-NEXT: vpermps %ymm8, %ymm0, %ymm6
2480 ; AVX2-NEXT: vpermps %ymm4, %ymm0, %ymm8
2481 ; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7]
2482 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
2483 ; AVX2-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3]
2484 ; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
2485 ; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm8[0,1],xmm5[2,3]
2486 ; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
2487 ; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
2488 ; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
2489 ; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7]
2490 ; AVX2-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm7[2],xmm1[3],xmm7[3]
2491 ; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
2492 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2493 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
2494 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2495 ; AVX2-NEXT: vmovaps %ymm1, 32(%rsi)
2496 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2497 ; AVX2-NEXT: vmovaps %ymm1, 96(%rsi)
2498 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2499 ; AVX2-NEXT: vmovaps %ymm1, 64(%rsi)
2500 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2501 ; AVX2-NEXT: vmovaps %ymm1, (%rsi)
2502 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2503 ; AVX2-NEXT: vmovaps %ymm1, 96(%rdx)
2504 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2505 ; AVX2-NEXT: vmovaps %ymm1, 32(%rdx)
2506 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2507 ; AVX2-NEXT: vmovaps %ymm1, (%rdx)
2508 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2509 ; AVX2-NEXT: vmovaps %ymm1, 64(%rdx)
2510 ; AVX2-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload
2511 ; AVX2-NEXT: vmovaps %ymm1, 32(%rcx)
2512 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2513 ; AVX2-NEXT: vmovaps %ymm1, 96(%rcx)
2514 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2515 ; AVX2-NEXT: vmovaps %ymm1, 64(%rcx)
2516 ; AVX2-NEXT: vmovaps %ymm13, (%rcx)
2517 ; AVX2-NEXT: vmovaps %ymm0, 96(%r8)
2518 ; AVX2-NEXT: vmovaps %ymm5, 32(%r8)
2519 ; AVX2-NEXT: vmovaps %ymm3, 64(%r8)
2520 ; AVX2-NEXT: vmovaps %ymm2, (%r8)
2521 ; AVX2-NEXT: addq $680, %rsp # imm = 0x2A8
2522 ; AVX2-NEXT: vzeroupper
2523 ; AVX2-NEXT: retq
2525 ; AVX2-FP-LABEL: load_i32_stride4_vf32:
2526 ; AVX2-FP: # %bb.0:
2527 ; AVX2-FP-NEXT: subq $680, %rsp # imm = 0x2A8
2528 ; AVX2-FP-NEXT: vmovaps 192(%rdi), %ymm12
2529 ; AVX2-FP-NEXT: vmovaps 416(%rdi), %ymm8
2530 ; AVX2-FP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2531 ; AVX2-FP-NEXT: vmovaps 448(%rdi), %ymm4
2532 ; AVX2-FP-NEXT: vmovaps 480(%rdi), %ymm10
2533 ; AVX2-FP-NEXT: vmovaps 288(%rdi), %ymm11
2534 ; AVX2-FP-NEXT: vmovaps 320(%rdi), %ymm6
2535 ; AVX2-FP-NEXT: vmovaps 352(%rdi), %ymm9
2536 ; AVX2-FP-NEXT: vbroadcastsd {{.*#+}} ymm0 = [0,4,0,4,0,4,0,4]
2537 ; AVX2-FP-NEXT: vpermps %ymm9, %ymm0, %ymm1
2538 ; AVX2-FP-NEXT: vpermps %ymm6, %ymm0, %ymm2
2539 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
2540 ; AVX2-FP-NEXT: vmovaps 272(%rdi), %xmm3
2541 ; AVX2-FP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2542 ; AVX2-FP-NEXT: vmovaps 256(%rdi), %xmm2
2543 ; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2544 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
2545 ; AVX2-FP-NEXT: vpermps %ymm11, %ymm0, %ymm3
2546 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3]
2547 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
2548 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2549 ; AVX2-FP-NEXT: vpermps %ymm10, %ymm0, %ymm1
2550 ; AVX2-FP-NEXT: vpermps %ymm4, %ymm0, %ymm2
2551 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
2552 ; AVX2-FP-NEXT: vmovaps 400(%rdi), %xmm3
2553 ; AVX2-FP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2554 ; AVX2-FP-NEXT: vmovaps 384(%rdi), %xmm2
2555 ; AVX2-FP-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill
2556 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
2557 ; AVX2-FP-NEXT: vpermps %ymm8, %ymm0, %ymm3
2558 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3]
2559 ; AVX2-FP-NEXT: vmovaps 224(%rdi), %ymm8
2560 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
2561 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2562 ; AVX2-FP-NEXT: vpermps %ymm8, %ymm0, %ymm1
2563 ; AVX2-FP-NEXT: vpermps %ymm12, %ymm0, %ymm2
2564 ; AVX2-FP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2565 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
2566 ; AVX2-FP-NEXT: vmovaps 160(%rdi), %ymm5
2567 ; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2568 ; AVX2-FP-NEXT: vmovaps 144(%rdi), %xmm3
2569 ; AVX2-FP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2570 ; AVX2-FP-NEXT: vmovaps 128(%rdi), %xmm2
2571 ; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2572 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
2573 ; AVX2-FP-NEXT: vpermps %ymm5, %ymm0, %ymm3
2574 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3]
2575 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
2576 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2577 ; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm14
2578 ; AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm15
2579 ; AVX2-FP-NEXT: vpermps %ymm15, %ymm0, %ymm1
2580 ; AVX2-FP-NEXT: vpermps %ymm14, %ymm0, %ymm2
2581 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
2582 ; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm2
2583 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2584 ; AVX2-FP-NEXT: vpermps %ymm2, %ymm0, %ymm0
2585 ; AVX2-FP-NEXT: vmovaps (%rdi), %xmm3
2586 ; AVX2-FP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2587 ; AVX2-FP-NEXT: vmovaps 16(%rdi), %xmm2
2588 ; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2589 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
2590 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
2591 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
2592 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2593 ; AVX2-FP-NEXT: vbroadcastsd {{.*#+}} ymm0 = [1,5,1,5,1,5,1,5]
2594 ; AVX2-FP-NEXT: vpermps %ymm9, %ymm0, %ymm1
2595 ; AVX2-FP-NEXT: vpermps %ymm6, %ymm0, %ymm2
2596 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
2597 ; AVX2-FP-NEXT: vmovaps 256(%rdi), %ymm5
2598 ; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2599 ; AVX2-FP-NEXT: vmovaps 304(%rdi), %xmm3
2600 ; AVX2-FP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2601 ; AVX2-FP-NEXT: vmovaps 288(%rdi), %xmm2
2602 ; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2603 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
2604 ; AVX2-FP-NEXT: vpermps %ymm5, %ymm0, %ymm3
2605 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
2606 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
2607 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2608 ; AVX2-FP-NEXT: vpermps %ymm15, %ymm0, %ymm1
2609 ; AVX2-FP-NEXT: vpermps %ymm14, %ymm0, %ymm2
2610 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
2611 ; AVX2-FP-NEXT: vmovaps (%rdi), %ymm7
2612 ; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2613 ; AVX2-FP-NEXT: vmovaps 32(%rdi), %xmm3
2614 ; AVX2-FP-NEXT: vmovaps 48(%rdi), %xmm2
2615 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm5 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
2616 ; AVX2-FP-NEXT: vpermps %ymm7, %ymm0, %ymm7
2617 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3]
2618 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7]
2619 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2620 ; AVX2-FP-NEXT: vpermps %ymm8, %ymm0, %ymm1
2621 ; AVX2-FP-NEXT: vpermps %ymm12, %ymm0, %ymm5
2622 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7]
2623 ; AVX2-FP-NEXT: vmovaps 128(%rdi), %ymm12
2624 ; AVX2-FP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2625 ; AVX2-FP-NEXT: vmovaps 176(%rdi), %xmm7
2626 ; AVX2-FP-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2627 ; AVX2-FP-NEXT: vmovaps 160(%rdi), %xmm5
2628 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm7 = xmm5[0],xmm7[0],xmm5[1],xmm7[1]
2629 ; AVX2-FP-NEXT: vpermps %ymm12, %ymm0, %ymm13
2630 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm7 = xmm13[0,1],xmm7[2,3]
2631 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7]
2632 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2633 ; AVX2-FP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2634 ; AVX2-FP-NEXT: vpermps %ymm10, %ymm0, %ymm1
2635 ; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2636 ; AVX2-FP-NEXT: vpermps %ymm4, %ymm0, %ymm7
2637 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm7[0,1,2,3,4,5],ymm1[6,7]
2638 ; AVX2-FP-NEXT: vmovaps 384(%rdi), %ymm1
2639 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2640 ; AVX2-FP-NEXT: vpermps %ymm1, %ymm0, %ymm0
2641 ; AVX2-FP-NEXT: vmovaps 432(%rdi), %xmm7
2642 ; AVX2-FP-NEXT: vmovaps 416(%rdi), %xmm1
2643 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm12 = xmm1[0],xmm7[0],xmm1[1],xmm7[1]
2644 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm12[2,3]
2645 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7]
2646 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2647 ; AVX2-FP-NEXT: vbroadcastsd {{.*#+}} ymm0 = [2,6,2,6,2,6,2,6]
2648 ; AVX2-FP-NEXT: vpermps %ymm9, %ymm0, %ymm12
2649 ; AVX2-FP-NEXT: vpermps %ymm6, %ymm0, %ymm13
2650 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7]
2651 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
2652 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload
2653 ; AVX2-FP-NEXT: # xmm13 = xmm13[2],mem[2],xmm13[3],mem[3]
2654 ; AVX2-FP-NEXT: vpermps %ymm11, %ymm0, %ymm11
2655 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm11 = xmm13[0,1],xmm11[2,3]
2656 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7]
2657 ; AVX2-FP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2658 ; AVX2-FP-NEXT: vpermps %ymm10, %ymm0, %ymm11
2659 ; AVX2-FP-NEXT: vpermps %ymm4, %ymm0, %ymm12
2660 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7]
2661 ; AVX2-FP-NEXT: vmovaps (%rsp), %xmm4 # 16-byte Reload
2662 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm12 # 16-byte Folded Reload
2663 ; AVX2-FP-NEXT: # xmm12 = xmm4[2],mem[2],xmm4[3],mem[3]
2664 ; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
2665 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3]
2666 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm11[4,5,6,7]
2667 ; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2668 ; AVX2-FP-NEXT: vpermps %ymm8, %ymm0, %ymm11
2669 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2670 ; AVX2-FP-NEXT: vpermps %ymm4, %ymm0, %ymm12
2671 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7]
2672 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
2673 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm12 # 16-byte Folded Reload
2674 ; AVX2-FP-NEXT: # xmm12 = xmm10[2],mem[2],xmm10[3],mem[3]
2675 ; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
2676 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3]
2677 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm11[4,5,6,7]
2678 ; AVX2-FP-NEXT: vmovups %ymm10, (%rsp) # 32-byte Spill
2679 ; AVX2-FP-NEXT: vpermps %ymm15, %ymm0, %ymm11
2680 ; AVX2-FP-NEXT: vpermps %ymm14, %ymm0, %ymm12
2681 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7]
2682 ; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
2683 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
2684 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm12 # 16-byte Folded Reload
2685 ; AVX2-FP-NEXT: # xmm12 = xmm10[2],mem[2],xmm10[3],mem[3]
2686 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm12[0,1],xmm0[2,3]
2687 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm11[4,5,6,7]
2688 ; AVX2-FP-NEXT: vbroadcastsd {{.*#+}} ymm0 = [3,7,3,7,3,7,3,7]
2689 ; AVX2-FP-NEXT: vpermps %ymm15, %ymm0, %ymm11
2690 ; AVX2-FP-NEXT: vpermps %ymm14, %ymm0, %ymm12
2691 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7]
2692 ; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
2693 ; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
2694 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
2695 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7]
2696 ; AVX2-FP-NEXT: vpermps %ymm9, %ymm0, %ymm3
2697 ; AVX2-FP-NEXT: vpermps %ymm6, %ymm0, %ymm6
2698 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7]
2699 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
2700 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload
2701 ; AVX2-FP-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3]
2702 ; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
2703 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm6 = xmm9[0,1],xmm6[2,3]
2704 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7]
2705 ; AVX2-FP-NEXT: vpermps %ymm8, %ymm0, %ymm6
2706 ; AVX2-FP-NEXT: vpermps %ymm4, %ymm0, %ymm8
2707 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7]
2708 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
2709 ; AVX2-FP-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3]
2710 ; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
2711 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm5 = xmm8[0,1],xmm5[2,3]
2712 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
2713 ; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
2714 ; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
2715 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7]
2716 ; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm7[2],xmm1[3],xmm7[3]
2717 ; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
2718 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2719 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
2720 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2721 ; AVX2-FP-NEXT: vmovaps %ymm1, 32(%rsi)
2722 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2723 ; AVX2-FP-NEXT: vmovaps %ymm1, 96(%rsi)
2724 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2725 ; AVX2-FP-NEXT: vmovaps %ymm1, 64(%rsi)
2726 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2727 ; AVX2-FP-NEXT: vmovaps %ymm1, (%rsi)
2728 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2729 ; AVX2-FP-NEXT: vmovaps %ymm1, 96(%rdx)
2730 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2731 ; AVX2-FP-NEXT: vmovaps %ymm1, 32(%rdx)
2732 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2733 ; AVX2-FP-NEXT: vmovaps %ymm1, (%rdx)
2734 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2735 ; AVX2-FP-NEXT: vmovaps %ymm1, 64(%rdx)
2736 ; AVX2-FP-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload
2737 ; AVX2-FP-NEXT: vmovaps %ymm1, 32(%rcx)
2738 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2739 ; AVX2-FP-NEXT: vmovaps %ymm1, 96(%rcx)
2740 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2741 ; AVX2-FP-NEXT: vmovaps %ymm1, 64(%rcx)
2742 ; AVX2-FP-NEXT: vmovaps %ymm13, (%rcx)
2743 ; AVX2-FP-NEXT: vmovaps %ymm0, 96(%r8)
2744 ; AVX2-FP-NEXT: vmovaps %ymm5, 32(%r8)
2745 ; AVX2-FP-NEXT: vmovaps %ymm3, 64(%r8)
2746 ; AVX2-FP-NEXT: vmovaps %ymm2, (%r8)
2747 ; AVX2-FP-NEXT: addq $680, %rsp # imm = 0x2A8
2748 ; AVX2-FP-NEXT: vzeroupper
2749 ; AVX2-FP-NEXT: retq
2751 ; AVX2-FCP-LABEL: load_i32_stride4_vf32:
2752 ; AVX2-FCP: # %bb.0:
2753 ; AVX2-FCP-NEXT: subq $680, %rsp # imm = 0x2A8
2754 ; AVX2-FCP-NEXT: vmovaps 192(%rdi), %ymm12
2755 ; AVX2-FCP-NEXT: vmovaps 416(%rdi), %ymm8
2756 ; AVX2-FCP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2757 ; AVX2-FCP-NEXT: vmovaps 448(%rdi), %ymm4
2758 ; AVX2-FCP-NEXT: vmovaps 480(%rdi), %ymm10
2759 ; AVX2-FCP-NEXT: vmovaps 288(%rdi), %ymm11
2760 ; AVX2-FCP-NEXT: vmovaps 320(%rdi), %ymm6
2761 ; AVX2-FCP-NEXT: vmovaps 352(%rdi), %ymm9
2762 ; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm0 = [0,4,0,4,0,4,0,4]
2763 ; AVX2-FCP-NEXT: vpermps %ymm9, %ymm0, %ymm1
2764 ; AVX2-FCP-NEXT: vpermps %ymm6, %ymm0, %ymm2
2765 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
2766 ; AVX2-FCP-NEXT: vmovaps 272(%rdi), %xmm3
2767 ; AVX2-FCP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2768 ; AVX2-FCP-NEXT: vmovaps 256(%rdi), %xmm2
2769 ; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2770 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
2771 ; AVX2-FCP-NEXT: vpermps %ymm11, %ymm0, %ymm3
2772 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3]
2773 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
2774 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2775 ; AVX2-FCP-NEXT: vpermps %ymm10, %ymm0, %ymm1
2776 ; AVX2-FCP-NEXT: vpermps %ymm4, %ymm0, %ymm2
2777 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
2778 ; AVX2-FCP-NEXT: vmovaps 400(%rdi), %xmm3
2779 ; AVX2-FCP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2780 ; AVX2-FCP-NEXT: vmovaps 384(%rdi), %xmm2
2781 ; AVX2-FCP-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill
2782 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
2783 ; AVX2-FCP-NEXT: vpermps %ymm8, %ymm0, %ymm3
2784 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3]
2785 ; AVX2-FCP-NEXT: vmovaps 224(%rdi), %ymm8
2786 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
2787 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2788 ; AVX2-FCP-NEXT: vpermps %ymm8, %ymm0, %ymm1
2789 ; AVX2-FCP-NEXT: vpermps %ymm12, %ymm0, %ymm2
2790 ; AVX2-FCP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2791 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
2792 ; AVX2-FCP-NEXT: vmovaps 160(%rdi), %ymm5
2793 ; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2794 ; AVX2-FCP-NEXT: vmovaps 144(%rdi), %xmm3
2795 ; AVX2-FCP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2796 ; AVX2-FCP-NEXT: vmovaps 128(%rdi), %xmm2
2797 ; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2798 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
2799 ; AVX2-FCP-NEXT: vpermps %ymm5, %ymm0, %ymm3
2800 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3]
2801 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
2802 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2803 ; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm14
2804 ; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm15
2805 ; AVX2-FCP-NEXT: vpermps %ymm15, %ymm0, %ymm1
2806 ; AVX2-FCP-NEXT: vpermps %ymm14, %ymm0, %ymm2
2807 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
2808 ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm2
2809 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2810 ; AVX2-FCP-NEXT: vpermps %ymm2, %ymm0, %ymm0
2811 ; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm3
2812 ; AVX2-FCP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2813 ; AVX2-FCP-NEXT: vmovaps 16(%rdi), %xmm2
2814 ; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2815 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
2816 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
2817 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
2818 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2819 ; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm0 = [1,5,1,5,1,5,1,5]
2820 ; AVX2-FCP-NEXT: vpermps %ymm9, %ymm0, %ymm1
2821 ; AVX2-FCP-NEXT: vpermps %ymm6, %ymm0, %ymm2
2822 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
2823 ; AVX2-FCP-NEXT: vmovaps 256(%rdi), %ymm5
2824 ; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2825 ; AVX2-FCP-NEXT: vmovaps 304(%rdi), %xmm3
2826 ; AVX2-FCP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2827 ; AVX2-FCP-NEXT: vmovaps 288(%rdi), %xmm2
2828 ; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2829 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
2830 ; AVX2-FCP-NEXT: vpermps %ymm5, %ymm0, %ymm3
2831 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
2832 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
2833 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2834 ; AVX2-FCP-NEXT: vpermps %ymm15, %ymm0, %ymm1
2835 ; AVX2-FCP-NEXT: vpermps %ymm14, %ymm0, %ymm2
2836 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
2837 ; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm7
2838 ; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2839 ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %xmm3
2840 ; AVX2-FCP-NEXT: vmovaps 48(%rdi), %xmm2
2841 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm5 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
2842 ; AVX2-FCP-NEXT: vpermps %ymm7, %ymm0, %ymm7
2843 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3]
2844 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7]
2845 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2846 ; AVX2-FCP-NEXT: vpermps %ymm8, %ymm0, %ymm1
2847 ; AVX2-FCP-NEXT: vpermps %ymm12, %ymm0, %ymm5
2848 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7]
2849 ; AVX2-FCP-NEXT: vmovaps 128(%rdi), %ymm12
2850 ; AVX2-FCP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2851 ; AVX2-FCP-NEXT: vmovaps 176(%rdi), %xmm7
2852 ; AVX2-FCP-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2853 ; AVX2-FCP-NEXT: vmovaps 160(%rdi), %xmm5
2854 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm7 = xmm5[0],xmm7[0],xmm5[1],xmm7[1]
2855 ; AVX2-FCP-NEXT: vpermps %ymm12, %ymm0, %ymm13
2856 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm7 = xmm13[0,1],xmm7[2,3]
2857 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7]
2858 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2859 ; AVX2-FCP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2860 ; AVX2-FCP-NEXT: vpermps %ymm10, %ymm0, %ymm1
2861 ; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2862 ; AVX2-FCP-NEXT: vpermps %ymm4, %ymm0, %ymm7
2863 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm7[0,1,2,3,4,5],ymm1[6,7]
2864 ; AVX2-FCP-NEXT: vmovaps 384(%rdi), %ymm1
2865 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2866 ; AVX2-FCP-NEXT: vpermps %ymm1, %ymm0, %ymm0
2867 ; AVX2-FCP-NEXT: vmovaps 432(%rdi), %xmm7
2868 ; AVX2-FCP-NEXT: vmovaps 416(%rdi), %xmm1
2869 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm12 = xmm1[0],xmm7[0],xmm1[1],xmm7[1]
2870 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm12[2,3]
2871 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7]
2872 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2873 ; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm0 = [2,6,2,6,2,6,2,6]
2874 ; AVX2-FCP-NEXT: vpermps %ymm9, %ymm0, %ymm12
2875 ; AVX2-FCP-NEXT: vpermps %ymm6, %ymm0, %ymm13
2876 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7]
2877 ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
2878 ; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload
2879 ; AVX2-FCP-NEXT: # xmm13 = xmm13[2],mem[2],xmm13[3],mem[3]
2880 ; AVX2-FCP-NEXT: vpermps %ymm11, %ymm0, %ymm11
2881 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm11 = xmm13[0,1],xmm11[2,3]
2882 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7]
2883 ; AVX2-FCP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2884 ; AVX2-FCP-NEXT: vpermps %ymm10, %ymm0, %ymm11
2885 ; AVX2-FCP-NEXT: vpermps %ymm4, %ymm0, %ymm12
2886 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7]
2887 ; AVX2-FCP-NEXT: vmovaps (%rsp), %xmm4 # 16-byte Reload
2888 ; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm12 # 16-byte Folded Reload
2889 ; AVX2-FCP-NEXT: # xmm12 = xmm4[2],mem[2],xmm4[3],mem[3]
2890 ; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
2891 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3]
2892 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm11[4,5,6,7]
2893 ; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2894 ; AVX2-FCP-NEXT: vpermps %ymm8, %ymm0, %ymm11
2895 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2896 ; AVX2-FCP-NEXT: vpermps %ymm4, %ymm0, %ymm12
2897 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7]
2898 ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
2899 ; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm12 # 16-byte Folded Reload
2900 ; AVX2-FCP-NEXT: # xmm12 = xmm10[2],mem[2],xmm10[3],mem[3]
2901 ; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
2902 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3]
2903 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm11[4,5,6,7]
2904 ; AVX2-FCP-NEXT: vmovups %ymm10, (%rsp) # 32-byte Spill
2905 ; AVX2-FCP-NEXT: vpermps %ymm15, %ymm0, %ymm11
2906 ; AVX2-FCP-NEXT: vpermps %ymm14, %ymm0, %ymm12
2907 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7]
2908 ; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
2909 ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
2910 ; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm12 # 16-byte Folded Reload
2911 ; AVX2-FCP-NEXT: # xmm12 = xmm10[2],mem[2],xmm10[3],mem[3]
2912 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm12[0,1],xmm0[2,3]
2913 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm11[4,5,6,7]
2914 ; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm0 = [3,7,3,7,3,7,3,7]
2915 ; AVX2-FCP-NEXT: vpermps %ymm15, %ymm0, %ymm11
2916 ; AVX2-FCP-NEXT: vpermps %ymm14, %ymm0, %ymm12
2917 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7]
2918 ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
2919 ; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
2920 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
2921 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7]
2922 ; AVX2-FCP-NEXT: vpermps %ymm9, %ymm0, %ymm3
2923 ; AVX2-FCP-NEXT: vpermps %ymm6, %ymm0, %ymm6
2924 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7]
2925 ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
2926 ; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload
2927 ; AVX2-FCP-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3]
2928 ; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
2929 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm6 = xmm9[0,1],xmm6[2,3]
2930 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7]
2931 ; AVX2-FCP-NEXT: vpermps %ymm8, %ymm0, %ymm6
2932 ; AVX2-FCP-NEXT: vpermps %ymm4, %ymm0, %ymm8
2933 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7]
2934 ; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
2935 ; AVX2-FCP-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3]
2936 ; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
2937 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm5 = xmm8[0,1],xmm5[2,3]
2938 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
2939 ; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
2940 ; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
2941 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7]
2942 ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm7[2],xmm1[3],xmm7[3]
2943 ; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
2944 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2945 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
2946 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2947 ; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%rsi)
2948 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2949 ; AVX2-FCP-NEXT: vmovaps %ymm1, 96(%rsi)
2950 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2951 ; AVX2-FCP-NEXT: vmovaps %ymm1, 64(%rsi)
2952 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2953 ; AVX2-FCP-NEXT: vmovaps %ymm1, (%rsi)
2954 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2955 ; AVX2-FCP-NEXT: vmovaps %ymm1, 96(%rdx)
2956 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2957 ; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%rdx)
2958 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2959 ; AVX2-FCP-NEXT: vmovaps %ymm1, (%rdx)
2960 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2961 ; AVX2-FCP-NEXT: vmovaps %ymm1, 64(%rdx)
2962 ; AVX2-FCP-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload
2963 ; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%rcx)
2964 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2965 ; AVX2-FCP-NEXT: vmovaps %ymm1, 96(%rcx)
2966 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2967 ; AVX2-FCP-NEXT: vmovaps %ymm1, 64(%rcx)
2968 ; AVX2-FCP-NEXT: vmovaps %ymm13, (%rcx)
2969 ; AVX2-FCP-NEXT: vmovaps %ymm0, 96(%r8)
2970 ; AVX2-FCP-NEXT: vmovaps %ymm5, 32(%r8)
2971 ; AVX2-FCP-NEXT: vmovaps %ymm3, 64(%r8)
2972 ; AVX2-FCP-NEXT: vmovaps %ymm2, (%r8)
2973 ; AVX2-FCP-NEXT: addq $680, %rsp # imm = 0x2A8
2974 ; AVX2-FCP-NEXT: vzeroupper
2975 ; AVX2-FCP-NEXT: retq
2977 ; AVX512-LABEL: load_i32_stride4_vf32:
2978 ; AVX512: # %bb.0:
2979 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
2980 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1
2981 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm2
2982 ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm3
2983 ; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm4
2984 ; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm5
2985 ; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm6
2986 ; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm7
2987 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28]
2988 ; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
2989 ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm9
2990 ; AVX512-NEXT: vpermt2d %zmm6, %zmm8, %zmm9
2991 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm10
2992 ; AVX512-NEXT: vpermt2d %zmm4, %zmm8, %zmm10
2993 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm10[0,1,2,3],zmm9[4,5,6,7]
2994 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm10
2995 ; AVX512-NEXT: vpermt2d %zmm3, %zmm8, %zmm10
2996 ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm8
2997 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],zmm10[4,5,6,7]
2998 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,5,9,13,17,21,25,29,1,5,9,13,17,21,25,29]
2999 ; AVX512-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
3000 ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm11
3001 ; AVX512-NEXT: vpermt2d %zmm6, %zmm10, %zmm11
3002 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm12
3003 ; AVX512-NEXT: vpermt2d %zmm4, %zmm10, %zmm12
3004 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm12[0,1,2,3],zmm11[4,5,6,7]
3005 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm12
3006 ; AVX512-NEXT: vpermt2d %zmm3, %zmm10, %zmm12
3007 ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm10
3008 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm12[4,5,6,7]
3009 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [2,6,10,14,18,22,26,30,2,6,10,14,18,22,26,30]
3010 ; AVX512-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3]
3011 ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm13
3012 ; AVX512-NEXT: vpermt2d %zmm6, %zmm12, %zmm13
3013 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm14
3014 ; AVX512-NEXT: vpermt2d %zmm4, %zmm12, %zmm14
3015 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm14[0,1,2,3],zmm13[4,5,6,7]
3016 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm14
3017 ; AVX512-NEXT: vpermt2d %zmm3, %zmm12, %zmm14
3018 ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm12
3019 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm14[4,5,6,7]
3020 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [3,7,11,15,19,23,27,31,3,7,11,15,19,23,27,31]
3021 ; AVX512-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3]
3022 ; AVX512-NEXT: vpermt2d %zmm6, %zmm14, %zmm7
3023 ; AVX512-NEXT: vpermt2d %zmm4, %zmm14, %zmm5
3024 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7]
3025 ; AVX512-NEXT: vpermt2d %zmm3, %zmm14, %zmm2
3026 ; AVX512-NEXT: vpermt2d %zmm1, %zmm14, %zmm0
3027 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7]
3028 ; AVX512-NEXT: vmovdqa64 %zmm9, 64(%rsi)
3029 ; AVX512-NEXT: vmovdqa64 %zmm8, (%rsi)
3030 ; AVX512-NEXT: vmovdqa64 %zmm11, 64(%rdx)
3031 ; AVX512-NEXT: vmovdqa64 %zmm10, (%rdx)
3032 ; AVX512-NEXT: vmovdqa64 %zmm13, 64(%rcx)
3033 ; AVX512-NEXT: vmovdqa64 %zmm12, (%rcx)
3034 ; AVX512-NEXT: vmovdqa64 %zmm4, 64(%r8)
3035 ; AVX512-NEXT: vmovdqa64 %zmm0, (%r8)
3036 ; AVX512-NEXT: vzeroupper
3037 ; AVX512-NEXT: retq
3039 ; AVX512-FCP-LABEL: load_i32_stride4_vf32:
3040 ; AVX512-FCP: # %bb.0:
3041 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
3042 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
3043 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2
3044 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3
3045 ; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm4
3046 ; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm5
3047 ; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm6
3048 ; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm7
3049 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28]
3050 ; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
3051 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm9
3052 ; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm8, %zmm9
3053 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm10
3054 ; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm8, %zmm10
3055 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm10[0,1,2,3],zmm9[4,5,6,7]
3056 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm10
3057 ; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm8, %zmm10
3058 ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8
3059 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],zmm10[4,5,6,7]
3060 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,5,9,13,17,21,25,29,1,5,9,13,17,21,25,29]
3061 ; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
3062 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm11
3063 ; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm10, %zmm11
3064 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm12
3065 ; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm10, %zmm12
3066 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm12[0,1,2,3],zmm11[4,5,6,7]
3067 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm12
3068 ; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm10, %zmm12
3069 ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm10
3070 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm12[4,5,6,7]
3071 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [2,6,10,14,18,22,26,30,2,6,10,14,18,22,26,30]
3072 ; AVX512-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3]
3073 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm13
3074 ; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm12, %zmm13
3075 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm14
3076 ; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm12, %zmm14
3077 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm14[0,1,2,3],zmm13[4,5,6,7]
3078 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm14
3079 ; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm12, %zmm14
3080 ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm12
3081 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm14[4,5,6,7]
3082 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [3,7,11,15,19,23,27,31,3,7,11,15,19,23,27,31]
3083 ; AVX512-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3]
3084 ; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm14, %zmm7
3085 ; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm14, %zmm5
3086 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7]
3087 ; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm14, %zmm2
3088 ; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm14, %zmm0
3089 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7]
3090 ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, 64(%rsi)
3091 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, (%rsi)
3092 ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, 64(%rdx)
3093 ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, (%rdx)
3094 ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 64(%rcx)
3095 ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, (%rcx)
3096 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 64(%r8)
3097 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%r8)
3098 ; AVX512-FCP-NEXT: vzeroupper
3099 ; AVX512-FCP-NEXT: retq
3101 ; AVX512DQ-LABEL: load_i32_stride4_vf32:
3102 ; AVX512DQ: # %bb.0:
3103 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0
3104 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm1
3105 ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm2
3106 ; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm3
3107 ; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm4
3108 ; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm5
3109 ; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm6
3110 ; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm7
3111 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28]
3112 ; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
3113 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm9
3114 ; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm8, %zmm9
3115 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm10
3116 ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm8, %zmm10
3117 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm10[0,1,2,3],zmm9[4,5,6,7]
3118 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm10
3119 ; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm8, %zmm10
3120 ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm8
3121 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],zmm10[4,5,6,7]
3122 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,5,9,13,17,21,25,29,1,5,9,13,17,21,25,29]
3123 ; AVX512DQ-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
3124 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm11
3125 ; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm10, %zmm11
3126 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm12
3127 ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm10, %zmm12
3128 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm12[0,1,2,3],zmm11[4,5,6,7]
3129 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm12
3130 ; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm10, %zmm12
3131 ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm10
3132 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm12[4,5,6,7]
3133 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [2,6,10,14,18,22,26,30,2,6,10,14,18,22,26,30]
3134 ; AVX512DQ-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3]
3135 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm13
3136 ; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm12, %zmm13
3137 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm14
3138 ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm12, %zmm14
3139 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm14[0,1,2,3],zmm13[4,5,6,7]
3140 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm14
3141 ; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm12, %zmm14
3142 ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm12
3143 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm14[4,5,6,7]
3144 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [3,7,11,15,19,23,27,31,3,7,11,15,19,23,27,31]
3145 ; AVX512DQ-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3]
3146 ; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm14, %zmm7
3147 ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm14, %zmm5
3148 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7]
3149 ; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm14, %zmm2
3150 ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm14, %zmm0
3151 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7]
3152 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, 64(%rsi)
3153 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, (%rsi)
3154 ; AVX512DQ-NEXT: vmovdqa64 %zmm11, 64(%rdx)
3155 ; AVX512DQ-NEXT: vmovdqa64 %zmm10, (%rdx)
3156 ; AVX512DQ-NEXT: vmovdqa64 %zmm13, 64(%rcx)
3157 ; AVX512DQ-NEXT: vmovdqa64 %zmm12, (%rcx)
3158 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, 64(%r8)
3159 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%r8)
3160 ; AVX512DQ-NEXT: vzeroupper
3161 ; AVX512DQ-NEXT: retq
3163 ; AVX512DQ-FCP-LABEL: load_i32_stride4_vf32:
3164 ; AVX512DQ-FCP: # %bb.0:
3165 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
3166 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
3167 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2
3168 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3
3169 ; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm4
3170 ; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm5
3171 ; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm6
3172 ; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm7
3173 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28]
3174 ; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
3175 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm9
3176 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm8, %zmm9
3177 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm10
3178 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm8, %zmm10
3179 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm10[0,1,2,3],zmm9[4,5,6,7]
3180 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm10
3181 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm8, %zmm10
3182 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8
3183 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],zmm10[4,5,6,7]
3184 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,5,9,13,17,21,25,29,1,5,9,13,17,21,25,29]
3185 ; AVX512DQ-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
3186 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm11
3187 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm10, %zmm11
3188 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm12
3189 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm10, %zmm12
3190 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm12[0,1,2,3],zmm11[4,5,6,7]
3191 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm12
3192 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm10, %zmm12
3193 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm10
3194 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm12[4,5,6,7]
3195 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [2,6,10,14,18,22,26,30,2,6,10,14,18,22,26,30]
3196 ; AVX512DQ-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3]
3197 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm13
3198 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm12, %zmm13
3199 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm14
3200 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm12, %zmm14
3201 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm14[0,1,2,3],zmm13[4,5,6,7]
3202 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm14
3203 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm12, %zmm14
3204 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm12
3205 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm14[4,5,6,7]
3206 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [3,7,11,15,19,23,27,31,3,7,11,15,19,23,27,31]
3207 ; AVX512DQ-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3]
3208 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm14, %zmm7
3209 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm14, %zmm5
3210 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7]
3211 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm14, %zmm2
3212 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm14, %zmm0
3213 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7]
3214 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, 64(%rsi)
3215 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, (%rsi)
3216 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, 64(%rdx)
3217 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, (%rdx)
3218 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 64(%rcx)
3219 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, (%rcx)
3220 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 64(%r8)
3221 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%r8)
3222 ; AVX512DQ-FCP-NEXT: vzeroupper
3223 ; AVX512DQ-FCP-NEXT: retq
3225 ; AVX512BW-LABEL: load_i32_stride4_vf32:
3226 ; AVX512BW: # %bb.0:
3227 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
3228 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1
3229 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm2
3230 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm3
3231 ; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm4
3232 ; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm5
3233 ; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm6
3234 ; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm7
3235 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28]
3236 ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
3237 ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9
3238 ; AVX512BW-NEXT: vpermt2d %zmm6, %zmm8, %zmm9
3239 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm10
3240 ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm8, %zmm10
3241 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm10[0,1,2,3],zmm9[4,5,6,7]
3242 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10
3243 ; AVX512BW-NEXT: vpermt2d %zmm3, %zmm8, %zmm10
3244 ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm8
3245 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],zmm10[4,5,6,7]
3246 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,5,9,13,17,21,25,29,1,5,9,13,17,21,25,29]
3247 ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
3248 ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm11
3249 ; AVX512BW-NEXT: vpermt2d %zmm6, %zmm10, %zmm11
3250 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm12
3251 ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm10, %zmm12
3252 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm12[0,1,2,3],zmm11[4,5,6,7]
3253 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm12
3254 ; AVX512BW-NEXT: vpermt2d %zmm3, %zmm10, %zmm12
3255 ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm10
3256 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm12[4,5,6,7]
3257 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [2,6,10,14,18,22,26,30,2,6,10,14,18,22,26,30]
3258 ; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3]
3259 ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm13
3260 ; AVX512BW-NEXT: vpermt2d %zmm6, %zmm12, %zmm13
3261 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm14
3262 ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm12, %zmm14
3263 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm14[0,1,2,3],zmm13[4,5,6,7]
3264 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm14
3265 ; AVX512BW-NEXT: vpermt2d %zmm3, %zmm12, %zmm14
3266 ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm12
3267 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm14[4,5,6,7]
3268 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [3,7,11,15,19,23,27,31,3,7,11,15,19,23,27,31]
3269 ; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3]
3270 ; AVX512BW-NEXT: vpermt2d %zmm6, %zmm14, %zmm7
3271 ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm14, %zmm5
3272 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7]
3273 ; AVX512BW-NEXT: vpermt2d %zmm3, %zmm14, %zmm2
3274 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm14, %zmm0
3275 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7]
3276 ; AVX512BW-NEXT: vmovdqa64 %zmm9, 64(%rsi)
3277 ; AVX512BW-NEXT: vmovdqa64 %zmm8, (%rsi)
3278 ; AVX512BW-NEXT: vmovdqa64 %zmm11, 64(%rdx)
3279 ; AVX512BW-NEXT: vmovdqa64 %zmm10, (%rdx)
3280 ; AVX512BW-NEXT: vmovdqa64 %zmm13, 64(%rcx)
3281 ; AVX512BW-NEXT: vmovdqa64 %zmm12, (%rcx)
3282 ; AVX512BW-NEXT: vmovdqa64 %zmm4, 64(%r8)
3283 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%r8)
3284 ; AVX512BW-NEXT: vzeroupper
3285 ; AVX512BW-NEXT: retq
3287 ; AVX512BW-FCP-LABEL: load_i32_stride4_vf32:
3288 ; AVX512BW-FCP: # %bb.0:
3289 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
3290 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
3291 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2
3292 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3
3293 ; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm4
3294 ; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm5
3295 ; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm6
3296 ; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm7
3297 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28]
3298 ; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
3299 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm9
3300 ; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm8, %zmm9
3301 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm10
3302 ; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm8, %zmm10
3303 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm10[0,1,2,3],zmm9[4,5,6,7]
3304 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm10
3305 ; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm8, %zmm10
3306 ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8
3307 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],zmm10[4,5,6,7]
3308 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,5,9,13,17,21,25,29,1,5,9,13,17,21,25,29]
3309 ; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
3310 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm11
3311 ; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm10, %zmm11
3312 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm12
3313 ; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm10, %zmm12
3314 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm12[0,1,2,3],zmm11[4,5,6,7]
3315 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm12
3316 ; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm10, %zmm12
3317 ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm10
3318 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm12[4,5,6,7]
3319 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [2,6,10,14,18,22,26,30,2,6,10,14,18,22,26,30]
3320 ; AVX512BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3]
3321 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm13
3322 ; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm12, %zmm13
3323 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm14
3324 ; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm12, %zmm14
3325 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm14[0,1,2,3],zmm13[4,5,6,7]
3326 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm14
3327 ; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm12, %zmm14
3328 ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm12
3329 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm14[4,5,6,7]
3330 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [3,7,11,15,19,23,27,31,3,7,11,15,19,23,27,31]
3331 ; AVX512BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3]
3332 ; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm14, %zmm7
3333 ; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm14, %zmm5
3334 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7]
3335 ; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm14, %zmm2
3336 ; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm14, %zmm0
3337 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7]
3338 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, 64(%rsi)
3339 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, (%rsi)
3340 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, 64(%rdx)
3341 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, (%rdx)
3342 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, 64(%rcx)
3343 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, (%rcx)
3344 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 64(%r8)
3345 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%r8)
3346 ; AVX512BW-FCP-NEXT: vzeroupper
3347 ; AVX512BW-FCP-NEXT: retq
3349 ; AVX512DQ-BW-LABEL: load_i32_stride4_vf32:
3350 ; AVX512DQ-BW: # %bb.0:
3351 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0
3352 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1
3353 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm2
3354 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm3
3355 ; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm4
3356 ; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm5
3357 ; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm6
3358 ; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm7
3359 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28]
3360 ; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
3361 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm9
3362 ; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm8, %zmm9
3363 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm10
3364 ; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm8, %zmm10
3365 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm10[0,1,2,3],zmm9[4,5,6,7]
3366 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm10
3367 ; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm8, %zmm10
3368 ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm8
3369 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],zmm10[4,5,6,7]
3370 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,5,9,13,17,21,25,29,1,5,9,13,17,21,25,29]
3371 ; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
3372 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm11
3373 ; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm10, %zmm11
3374 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm12
3375 ; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm10, %zmm12
3376 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm12[0,1,2,3],zmm11[4,5,6,7]
3377 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm12
3378 ; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm10, %zmm12
3379 ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm10
3380 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm12[4,5,6,7]
3381 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [2,6,10,14,18,22,26,30,2,6,10,14,18,22,26,30]
3382 ; AVX512DQ-BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3]
3383 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm13
3384 ; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm12, %zmm13
3385 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm14
3386 ; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm12, %zmm14
3387 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm14[0,1,2,3],zmm13[4,5,6,7]
3388 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm14
3389 ; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm12, %zmm14
3390 ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm12
3391 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm14[4,5,6,7]
3392 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [3,7,11,15,19,23,27,31,3,7,11,15,19,23,27,31]
3393 ; AVX512DQ-BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3]
3394 ; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm14, %zmm7
3395 ; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm14, %zmm5
3396 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7]
3397 ; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm14, %zmm2
3398 ; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm14, %zmm0
3399 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7]
3400 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, 64(%rsi)
3401 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, (%rsi)
3402 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, 64(%rdx)
3403 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, (%rdx)
3404 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, 64(%rcx)
3405 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, (%rcx)
3406 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 64(%r8)
3407 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%r8)
3408 ; AVX512DQ-BW-NEXT: vzeroupper
3409 ; AVX512DQ-BW-NEXT: retq
3411 ; AVX512DQ-BW-FCP-LABEL: load_i32_stride4_vf32:
3412 ; AVX512DQ-BW-FCP: # %bb.0:
3413 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
3414 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
3415 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2
3416 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3
3417 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm4
3418 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm5
3419 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm6
3420 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm7
3421 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28]
3422 ; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
3423 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm9
3424 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm8, %zmm9
3425 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm10
3426 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm8, %zmm10
3427 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm10[0,1,2,3],zmm9[4,5,6,7]
3428 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm10
3429 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm8, %zmm10
3430 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8
3431 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],zmm10[4,5,6,7]
3432 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,5,9,13,17,21,25,29,1,5,9,13,17,21,25,29]
3433 ; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
3434 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm11
3435 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm10, %zmm11
3436 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm12
3437 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm10, %zmm12
3438 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm12[0,1,2,3],zmm11[4,5,6,7]
3439 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm12
3440 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm10, %zmm12
3441 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm10
3442 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm12[4,5,6,7]
3443 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [2,6,10,14,18,22,26,30,2,6,10,14,18,22,26,30]
3444 ; AVX512DQ-BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3]
3445 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm13
3446 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm12, %zmm13
3447 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm14
3448 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm12, %zmm14
3449 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm14[0,1,2,3],zmm13[4,5,6,7]
3450 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm14
3451 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm12, %zmm14
3452 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm12
3453 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm14[4,5,6,7]
3454 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [3,7,11,15,19,23,27,31,3,7,11,15,19,23,27,31]
3455 ; AVX512DQ-BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3]
3456 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm14, %zmm7
3457 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm14, %zmm5
3458 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7]
3459 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm14, %zmm2
3460 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm14, %zmm0
3461 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7]
3462 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, 64(%rsi)
3463 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, (%rsi)
3464 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, 64(%rdx)
3465 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, (%rdx)
3466 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, 64(%rcx)
3467 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, (%rcx)
3468 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 64(%r8)
3469 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%r8)
3470 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
3471 ; AVX512DQ-BW-FCP-NEXT: retq
3472 %wide.vec = load <128 x i32>, ptr %in.vec, align 64
3473 %strided.vec0 = shufflevector <128 x i32> %wide.vec, <128 x i32> poison, <32 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60, i32 64, i32 68, i32 72, i32 76, i32 80, i32 84, i32 88, i32 92, i32 96, i32 100, i32 104, i32 108, i32 112, i32 116, i32 120, i32 124>
3474 %strided.vec1 = shufflevector <128 x i32> %wide.vec, <128 x i32> poison, <32 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61, i32 65, i32 69, i32 73, i32 77, i32 81, i32 85, i32 89, i32 93, i32 97, i32 101, i32 105, i32 109, i32 113, i32 117, i32 121, i32 125>
3475 %strided.vec2 = shufflevector <128 x i32> %wide.vec, <128 x i32> poison, <32 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62, i32 66, i32 70, i32 74, i32 78, i32 82, i32 86, i32 90, i32 94, i32 98, i32 102, i32 106, i32 110, i32 114, i32 118, i32 122, i32 126>
3476 %strided.vec3 = shufflevector <128 x i32> %wide.vec, <128 x i32> poison, <32 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63, i32 67, i32 71, i32 75, i32 79, i32 83, i32 87, i32 91, i32 95, i32 99, i32 103, i32 107, i32 111, i32 115, i32 119, i32 123, i32 127>
3477 store <32 x i32> %strided.vec0, ptr %out.vec0, align 64
3478 store <32 x i32> %strided.vec1, ptr %out.vec1, align 64
3479 store <32 x i32> %strided.vec2, ptr %out.vec2, align 64
3480 store <32 x i32> %strided.vec3, ptr %out.vec3, align 64
3481 ret void
3482 }
3484 define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3) nounwind {
3485 ; SSE-LABEL: load_i32_stride4_vf64:
3486 ; SSE: # %bb.0:
3487 ; SSE-NEXT: subq $1224, %rsp # imm = 0x4C8
3488 ; SSE-NEXT: movaps 144(%rdi), %xmm14
3489 ; SSE-NEXT: movaps 176(%rdi), %xmm11
3490 ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3491 ; SSE-NEXT: movaps 160(%rdi), %xmm5
3492 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3493 ; SSE-NEXT: movaps 208(%rdi), %xmm3
3494 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3495 ; SSE-NEXT: movaps 192(%rdi), %xmm8
3496 ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3497 ; SSE-NEXT: movaps 240(%rdi), %xmm6
3498 ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3499 ; SSE-NEXT: movaps 224(%rdi), %xmm7
3500 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3501 ; SSE-NEXT: movaps 80(%rdi), %xmm10
3502 ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3503 ; SSE-NEXT: movaps 64(%rdi), %xmm4
3504 ; SSE-NEXT: movaps 112(%rdi), %xmm2
3505 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3506 ; SSE-NEXT: movaps 96(%rdi), %xmm0
3507 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3508 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
3509 ; SSE-NEXT: movaps %xmm4, %xmm2
3510 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1]
3511 ; SSE-NEXT: movaps %xmm2, %xmm1
3512 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
3513 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3514 ; SSE-NEXT: movaps %xmm7, %xmm1
3515 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1]
3516 ; SSE-NEXT: movaps %xmm8, %xmm6
3517 ; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
3518 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
3519 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3520 ; SSE-NEXT: movaps %xmm6, %xmm0
3521 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3522 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3523 ; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm1[1]
3524 ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3525 ; SSE-NEXT: movaps %xmm5, %xmm0
3526 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1]
3527 ; SSE-NEXT: movaps 128(%rdi), %xmm2
3528 ; SSE-NEXT: movaps %xmm2, %xmm1
3529 ; SSE-NEXT: movaps %xmm2, %xmm10
3530 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1]
3531 ; SSE-NEXT: movaps %xmm1, %xmm2
3532 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
3533 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3534 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
3535 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3536 ; SSE-NEXT: movaps 368(%rdi), %xmm12
3537 ; SSE-NEXT: movaps 352(%rdi), %xmm0
3538 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3539 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1]
3540 ; SSE-NEXT: movaps 336(%rdi), %xmm13
3541 ; SSE-NEXT: movaps 320(%rdi), %xmm11
3542 ; SSE-NEXT: movaps %xmm11, %xmm1
3543 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1]
3544 ; SSE-NEXT: movaps %xmm1, %xmm2
3545 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
3546 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3547 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
3548 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3549 ; SSE-NEXT: movaps 304(%rdi), %xmm2
3550 ; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill
3551 ; SSE-NEXT: movaps 288(%rdi), %xmm8
3552 ; SSE-NEXT: movaps %xmm8, %xmm0
3553 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
3554 ; SSE-NEXT: movaps 272(%rdi), %xmm3
3555 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3556 ; SSE-NEXT: movaps 256(%rdi), %xmm2
3557 ; SSE-NEXT: movaps %xmm2, %xmm1
3558 ; SSE-NEXT: movaps %xmm2, %xmm9
3559 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
3560 ; SSE-NEXT: movaps %xmm1, %xmm2
3561 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
3562 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3563 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
3564 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3565 ; SSE-NEXT: movaps 496(%rdi), %xmm1
3566 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3567 ; SSE-NEXT: movaps 480(%rdi), %xmm0
3568 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3569 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3570 ; SSE-NEXT: movaps 464(%rdi), %xmm2
3571 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3572 ; SSE-NEXT: movaps 448(%rdi), %xmm1
3573 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3574 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
3575 ; SSE-NEXT: movaps %xmm1, %xmm2
3576 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
3577 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3578 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
3579 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3580 ; SSE-NEXT: movaps 432(%rdi), %xmm2
3581 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3582 ; SSE-NEXT: movaps 416(%rdi), %xmm5
3583 ; SSE-NEXT: movaps %xmm5, %xmm0
3584 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
3585 ; SSE-NEXT: movaps 400(%rdi), %xmm2
3586 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3587 ; SSE-NEXT: movaps 384(%rdi), %xmm1
3588 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3589 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
3590 ; SSE-NEXT: movaps %xmm1, %xmm2
3591 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
3592 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3593 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
3594 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3595 ; SSE-NEXT: movaps 624(%rdi), %xmm2
3596 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3597 ; SSE-NEXT: movaps 608(%rdi), %xmm7
3598 ; SSE-NEXT: movaps %xmm7, %xmm0
3599 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
3600 ; SSE-NEXT: movaps 592(%rdi), %xmm2
3601 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3602 ; SSE-NEXT: movaps 576(%rdi), %xmm1
3603 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3604 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
3605 ; SSE-NEXT: movaps %xmm1, %xmm2
3606 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
3607 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3608 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
3609 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3610 ; SSE-NEXT: movaps 560(%rdi), %xmm1
3611 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3612 ; SSE-NEXT: movaps 544(%rdi), %xmm0
3613 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3614 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3615 ; SSE-NEXT: movaps 528(%rdi), %xmm2
3616 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3617 ; SSE-NEXT: movaps 512(%rdi), %xmm1
3618 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3619 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
3620 ; SSE-NEXT: movaps %xmm1, %xmm2
3621 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
3622 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3623 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
3624 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3625 ; SSE-NEXT: movaps 752(%rdi), %xmm1
3626 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3627 ; SSE-NEXT: movaps 736(%rdi), %xmm0
3628 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3629 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3630 ; SSE-NEXT: movaps 720(%rdi), %xmm2
3631 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3632 ; SSE-NEXT: movaps 704(%rdi), %xmm1
3633 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3634 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
3635 ; SSE-NEXT: movaps %xmm1, %xmm2
3636 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
3637 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3638 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
3639 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3640 ; SSE-NEXT: movaps 688(%rdi), %xmm1
3641 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3642 ; SSE-NEXT: movaps 672(%rdi), %xmm0
3643 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3644 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3645 ; SSE-NEXT: movaps 656(%rdi), %xmm3
3646 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3647 ; SSE-NEXT: movaps 640(%rdi), %xmm2
3648 ; SSE-NEXT: movaps %xmm2, %xmm1
3649 ; SSE-NEXT: movaps %xmm2, %xmm15
3650 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
3651 ; SSE-NEXT: movaps %xmm1, %xmm2
3652 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
3653 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3654 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
3655 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3656 ; SSE-NEXT: movaps 880(%rdi), %xmm1
3657 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3658 ; SSE-NEXT: movaps 864(%rdi), %xmm0
3659 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3660 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3661 ; SSE-NEXT: movaps 848(%rdi), %xmm2
3662 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3663 ; SSE-NEXT: movaps 832(%rdi), %xmm1
3664 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3665 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
3666 ; SSE-NEXT: movaps %xmm1, %xmm2
3667 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
3668 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3669 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
3670 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3671 ; SSE-NEXT: movaps 816(%rdi), %xmm1
3672 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3673 ; SSE-NEXT: movaps 800(%rdi), %xmm0
3674 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3675 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3676 ; SSE-NEXT: movaps 784(%rdi), %xmm2
3677 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3678 ; SSE-NEXT: movaps 768(%rdi), %xmm1
3679 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3680 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
3681 ; SSE-NEXT: movaps %xmm1, %xmm2
3682 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
3683 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3684 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
3685 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3686 ; SSE-NEXT: movaps 1008(%rdi), %xmm1
3687 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3688 ; SSE-NEXT: movaps 992(%rdi), %xmm0
3689 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3690 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3691 ; SSE-NEXT: movaps 976(%rdi), %xmm3
3692 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3693 ; SSE-NEXT: movaps 960(%rdi), %xmm2
3694 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3695 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
3696 ; SSE-NEXT: movaps %xmm2, %xmm1
3697 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
3698 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3699 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
3700 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3701 ; SSE-NEXT: movaps 944(%rdi), %xmm0
3702 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3703 ; SSE-NEXT: movaps 928(%rdi), %xmm1
3704 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3705 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
3706 ; SSE-NEXT: movaps 912(%rdi), %xmm2
3707 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3708 ; SSE-NEXT: movaps 896(%rdi), %xmm0
3709 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3710 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
3711 ; SSE-NEXT: movaps %xmm0, %xmm2
3712 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
3713 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3714 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
3715 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3716 ; SSE-NEXT: movaps 32(%rdi), %xmm2
3717 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3718 ; SSE-NEXT: movaps 48(%rdi), %xmm0
3719 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3720 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
3721 ; SSE-NEXT: movaps (%rdi), %xmm0
3722 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3723 ; SSE-NEXT: movaps 16(%rdi), %xmm6
3724 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1]
3725 ; SSE-NEXT: movaps %xmm0, %xmm1
3726 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
3727 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3728 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
3729 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3730 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3731 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
3732 ; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3]
3733 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
3734 ; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3]
3735 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3736 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3737 ; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
3738 ; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm14[2],xmm10[3],xmm14[3]
3739 ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3740 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
3741 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
3742 ; SSE-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3]
3743 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
3744 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
3745 ; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3]
3746 ; SSE-NEXT: unpckhps (%rsp), %xmm8 # 16-byte Folded Reload
3747 ; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3]
3748 ; SSE-NEXT: movaps %xmm8, (%rsp) # 16-byte Spill
3749 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
3750 ; SSE-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3]
3751 ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3752 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3753 ; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm12[2],xmm1[3],xmm12[3]
3754 ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm13[2],xmm11[3],xmm13[3]
3755 ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3756 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
3757 ; SSE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3]
3758 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3759 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
3760 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
3761 ; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3]
3762 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
3763 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
3764 ; SSE-NEXT: # xmm13 = xmm13[2],mem[2],xmm13[3],mem[3]
3765 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
3766 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
3767 ; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3]
3768 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
3769 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
3770 ; SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3]
3771 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
3772 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
3773 ; SSE-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3]
3774 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
3775 ; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3]
3776 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3777 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
3778 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
3779 ; SSE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3]
3780 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
3781 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
3782 ; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3]
3783 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3784 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
3785 ; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3]
3786 ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3787 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
3788 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
3789 ; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3]
3790 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
3791 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
3792 ; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3]
3793 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
3794 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
3795 ; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3]
3796 ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3797 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
3798 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
3799 ; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3]
3800 ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3801 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
3802 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
3803 ; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3]
3804 ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3805 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
3806 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
3807 ; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3]
3808 ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3809 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
3810 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
3811 ; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3]
3812 ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3813 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
3814 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
3815 ; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3]
3816 ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3817 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
3818 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
3819 ; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3]
3820 ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3821 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
3822 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
3823 ; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3]
3824 ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3825 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
3826 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
3827 ; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3]
3828 ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3829 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
3830 ; SSE-NEXT: unpckhps {{.*#+}} xmm15 = xmm15[2],xmm6[2],xmm15[3],xmm6[3]
3831 ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3832 ; SSE-NEXT: movaps %xmm4, %xmm6
3833 ; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm3[0]
3834 ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3835 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
3836 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3837 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3838 ; SSE-NEXT: movaps %xmm3, %xmm4
3839 ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0]
3840 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3841 ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1]
3842 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3843 ; SSE-NEXT: movaps %xmm2, %xmm3
3844 ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm14[0]
3845 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3846 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm14[1]
3847 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3848 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3849 ; SSE-NEXT: movaps %xmm0, %xmm3
3850 ; SSE-NEXT: movaps (%rsp), %xmm2 # 16-byte Reload
3851 ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0]
3852 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3853 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
3854 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3855 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3856 ; SSE-NEXT: movaps %xmm0, %xmm2
3857 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
3858 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3859 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
3860 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3861 ; SSE-NEXT: movaps %xmm12, %xmm15
3862 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3863 ; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm1[0]
3864 ; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm1[1]
3865 ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3866 ; SSE-NEXT: movaps %xmm10, %xmm0
3867 ; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm13[0]
3868 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm13[1]
3869 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3870 ; SSE-NEXT: movaps %xmm9, %xmm6
3871 ; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm11[0]
3872 ; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm11[1]
3873 ; SSE-NEXT: movaps %xmm9, %xmm11
3874 ; SSE-NEXT: movaps %xmm5, %xmm0
3875 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3876 ; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0]
3877 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
3878 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3879 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3880 ; SSE-NEXT: movaps %xmm0, %xmm4
3881 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3882 ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm1[0]
3883 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
3884 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3885 ; SSE-NEXT: movaps %xmm7, %xmm3
3886 ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm8[0]
3887 ; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm8[1]
3888 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3889 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3890 ; SSE-NEXT: movaps %xmm0, %xmm2
3891 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3892 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
3893 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
3894 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3895 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3896 ; SSE-NEXT: movaps %xmm0, %xmm1
3897 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
3898 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm7[0]
3899 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm7[1]
3900 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3901 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3902 ; SSE-NEXT: movaps %xmm0, %xmm12
3903 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
3904 ; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm7[0]
3905 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm7[1]
3906 ; SSE-NEXT: movaps %xmm0, %xmm7
3907 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
3908 ; SSE-NEXT: movaps %xmm8, %xmm0
3909 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
3910 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm9[0]
3911 ; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm9[1]
3912 ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3913 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
3914 ; SSE-NEXT: movaps %xmm8, %xmm13
3915 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
3916 ; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm9[0]
3917 ; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm9[1]
3918 ; SSE-NEXT: movaps %xmm8, %xmm9
3919 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
3920 ; SSE-NEXT: movaps %xmm8, 224(%rsi)
3921 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
3922 ; SSE-NEXT: movaps %xmm8, 160(%rsi)
3923 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
3924 ; SSE-NEXT: movaps %xmm14, 96(%rsi)
3925 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
3926 ; SSE-NEXT: movaps %xmm14, 32(%rsi)
3927 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
3928 ; SSE-NEXT: movaps %xmm8, 240(%rsi)
3929 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
3930 ; SSE-NEXT: movaps %xmm8, 176(%rsi)
3931 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
3932 ; SSE-NEXT: movaps %xmm14, 112(%rsi)
3933 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
3934 ; SSE-NEXT: movaps %xmm14, 48(%rsi)
3935 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
3936 ; SSE-NEXT: movaps %xmm8, 192(%rsi)
3937 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
3938 ; SSE-NEXT: movaps %xmm14, 128(%rsi)
3939 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
3940 ; SSE-NEXT: movaps %xmm14, 64(%rsi)
3941 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
3942 ; SSE-NEXT: movaps %xmm8, (%rsi)
3943 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
3944 ; SSE-NEXT: movaps %xmm8, 208(%rsi)
3945 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
3946 ; SSE-NEXT: movaps %xmm14, 144(%rsi)
3947 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
3948 ; SSE-NEXT: movaps %xmm14, 80(%rsi)
3949 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
3950 ; SSE-NEXT: movaps %xmm14, 16(%rsi)
3951 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
3952 ; SSE-NEXT: movaps %xmm8, 224(%rdx)
3953 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
3954 ; SSE-NEXT: movaps %xmm8, 240(%rdx)
3955 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
3956 ; SSE-NEXT: movaps %xmm8, 192(%rdx)
3957 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
3958 ; SSE-NEXT: movaps %xmm8, 208(%rdx)
3959 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
3960 ; SSE-NEXT: movaps %xmm8, 160(%rdx)
3961 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
3962 ; SSE-NEXT: movaps %xmm14, 176(%rdx)
3963 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
3964 ; SSE-NEXT: movaps %xmm14, 128(%rdx)
3965 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
3966 ; SSE-NEXT: movaps %xmm14, 144(%rdx)
3967 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
3968 ; SSE-NEXT: movaps %xmm14, 96(%rdx)
3969 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
3970 ; SSE-NEXT: movaps %xmm14, 112(%rdx)
3971 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
3972 ; SSE-NEXT: movaps %xmm14, 64(%rdx)
3973 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
3974 ; SSE-NEXT: movaps %xmm14, 80(%rdx)
3975 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
3976 ; SSE-NEXT: movaps %xmm14, 32(%rdx)
3977 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
3978 ; SSE-NEXT: movaps %xmm14, 48(%rdx)
3979 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
3980 ; SSE-NEXT: movaps %xmm8, (%rdx)
3981 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
3982 ; SSE-NEXT: movaps %xmm14, 16(%rdx)
3983 ; SSE-NEXT: movaps %xmm0, 240(%rcx)
3984 ; SSE-NEXT: movaps %xmm12, 224(%rcx)
3985 ; SSE-NEXT: movaps %xmm1, 208(%rcx)
3986 ; SSE-NEXT: movaps %xmm2, 192(%rcx)
3987 ; SSE-NEXT: movaps %xmm3, 176(%rcx)
3988 ; SSE-NEXT: movaps %xmm4, 160(%rcx)
3989 ; SSE-NEXT: movaps %xmm5, 144(%rcx)
3990 ; SSE-NEXT: movaps %xmm6, 128(%rcx)
3991 ; SSE-NEXT: movaps %xmm10, 112(%rcx)
3992 ; SSE-NEXT: movaps %xmm15, 96(%rcx)
3993 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3994 ; SSE-NEXT: movaps %xmm0, 80(%rcx)
3995 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3996 ; SSE-NEXT: movaps %xmm0, 64(%rcx)
3997 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3998 ; SSE-NEXT: movaps %xmm0, 48(%rcx)
3999 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4000 ; SSE-NEXT: movaps %xmm0, 32(%rcx)
4001 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4002 ; SSE-NEXT: movaps %xmm0, 16(%rcx)
4003 ; SSE-NEXT: movaps %xmm13, (%rcx)
4004 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4005 ; SSE-NEXT: movaps %xmm0, 240(%r8)
4006 ; SSE-NEXT: movaps %xmm7, 224(%r8)
4007 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4008 ; SSE-NEXT: movaps %xmm0, 208(%r8)
4009 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4010 ; SSE-NEXT: movaps %xmm0, 192(%r8)
4011 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4012 ; SSE-NEXT: movaps %xmm0, 176(%r8)
4013 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4014 ; SSE-NEXT: movaps %xmm0, 160(%r8)
4015 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4016 ; SSE-NEXT: movaps %xmm0, 144(%r8)
4017 ; SSE-NEXT: movaps %xmm11, 128(%r8)
4018 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4019 ; SSE-NEXT: movaps %xmm0, 112(%r8)
4020 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4021 ; SSE-NEXT: movaps %xmm0, 96(%r8)
4022 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4023 ; SSE-NEXT: movaps %xmm0, 80(%r8)
4024 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4025 ; SSE-NEXT: movaps %xmm0, 64(%r8)
4026 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4027 ; SSE-NEXT: movaps %xmm0, 48(%r8)
4028 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4029 ; SSE-NEXT: movaps %xmm0, 32(%r8)
4030 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4031 ; SSE-NEXT: movaps %xmm0, 16(%r8)
4032 ; SSE-NEXT: movaps %xmm9, (%r8)
4033 ; SSE-NEXT: addq $1224, %rsp # imm = 0x4C8
4034 ; SSE-NEXT: retq
4035 ;
4036 ; AVX-LABEL: load_i32_stride4_vf64:
4037 ; AVX: # %bb.0:
4038 ; AVX-NEXT: subq $2184, %rsp # imm = 0x888
4039 ; AVX-NEXT: vmovaps 448(%rdi), %ymm3
4040 ; AVX-NEXT: vmovaps 480(%rdi), %ymm4
4041 ; AVX-NEXT: vmovaps 192(%rdi), %ymm14
4042 ; AVX-NEXT: vmovaps 224(%rdi), %ymm9
4043 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm9[2,3,0,1]
4044 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm9[0],ymm8[0],ymm9[2],ymm8[2]
4045 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm14[2,3,0,1]
4046 ; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = ymm7[0],ymm14[0],ymm7[1],ymm14[1],ymm7[4],ymm14[4],ymm7[5],ymm14[5]
4047 ; AVX-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4048 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4]
4049 ; AVX-NEXT: vmovaps 160(%rdi), %xmm1
4050 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4051 ; AVX-NEXT: vmovaps 176(%rdi), %xmm13
4052 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm13[0],xmm1[0]
4053 ; AVX-NEXT: vmovaps 144(%rdi), %xmm2
4054 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4055 ; AVX-NEXT: vmovaps 128(%rdi), %xmm12
4056 ; AVX-NEXT: vunpcklps {{.*#+}} xmm2 = xmm12[0],xmm2[0],xmm12[1],xmm2[1]
4057 ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,0]
4058 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4059 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4060 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm4[2,3,0,1]
4061 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4062 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],ymm0[0],ymm4[2],ymm0[2]
4063 ; AVX-NEXT: vmovaps %ymm4, %ymm5
4064 ; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4065 ; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4066 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm3[2,3,0,1]
4067 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4068 ; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5]
4069 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4]
4070 ; AVX-NEXT: vmovaps 416(%rdi), %xmm2
4071 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4072 ; AVX-NEXT: vmovaps 432(%rdi), %xmm1
4073 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4074 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
4075 ; AVX-NEXT: vmovaps 400(%rdi), %xmm2
4076 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4077 ; AVX-NEXT: vmovaps 384(%rdi), %xmm6
4078 ; AVX-NEXT: vunpcklps {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1]
4079 ; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4080 ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,0]
4081 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4082 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4083 ; AVX-NEXT: vmovaps 704(%rdi), %ymm2
4084 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4085 ; AVX-NEXT: vmovaps 736(%rdi), %ymm1
4086 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4087 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1]
4088 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4089 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
4090 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1]
4091 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4092 ; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5]
4093 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4]
4094 ; AVX-NEXT: vmovaps 672(%rdi), %xmm2
4095 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4096 ; AVX-NEXT: vmovaps 688(%rdi), %xmm1
4097 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4098 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
4099 ; AVX-NEXT: vmovaps 656(%rdi), %xmm3
4100 ; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4101 ; AVX-NEXT: vmovaps 640(%rdi), %xmm2
4102 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4103 ; AVX-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
4104 ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,0]
4105 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4106 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4107 ; AVX-NEXT: vmovaps 960(%rdi), %ymm2
4108 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4109 ; AVX-NEXT: vmovaps 992(%rdi), %ymm1
4110 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4111 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1]
4112 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4113 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
4114 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1]
4115 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4116 ; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5]
4117 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4]
4118 ; AVX-NEXT: vmovaps 928(%rdi), %xmm2
4119 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4120 ; AVX-NEXT: vmovaps 944(%rdi), %xmm1
4121 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4122 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
4123 ; AVX-NEXT: vmovaps 912(%rdi), %xmm3
4124 ; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4125 ; AVX-NEXT: vmovaps 896(%rdi), %xmm2
4126 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4127 ; AVX-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
4128 ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,0]
4129 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4130 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4131 ; AVX-NEXT: vmovaps 320(%rdi), %ymm2
4132 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4133 ; AVX-NEXT: vmovaps 352(%rdi), %ymm1
4134 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4135 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1]
4136 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4137 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
4138 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1]
4139 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4140 ; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5]
4141 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4]
4142 ; AVX-NEXT: vmovaps 288(%rdi), %xmm2
4143 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4144 ; AVX-NEXT: vmovaps 304(%rdi), %xmm1
4145 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4146 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
4147 ; AVX-NEXT: vmovaps 272(%rdi), %xmm3
4148 ; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4149 ; AVX-NEXT: vmovaps 256(%rdi), %xmm2
4150 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4151 ; AVX-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
4152 ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,0]
4153 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4154 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4155 ; AVX-NEXT: vmovaps 576(%rdi), %ymm2
4156 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4157 ; AVX-NEXT: vmovaps 608(%rdi), %ymm1
4158 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4159 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1]
4160 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4161 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
4162 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1]
4163 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4164 ; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5]
4165 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4]
4166 ; AVX-NEXT: vmovaps 544(%rdi), %xmm2
4167 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4168 ; AVX-NEXT: vmovaps 560(%rdi), %xmm1
4169 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4170 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
4171 ; AVX-NEXT: vmovaps 528(%rdi), %xmm3
4172 ; AVX-NEXT: vmovaps %xmm3, (%rsp) # 16-byte Spill
4173 ; AVX-NEXT: vmovaps 512(%rdi), %xmm2
4174 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4175 ; AVX-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
4176 ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,0]
4177 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4178 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4179 ; AVX-NEXT: vmovaps 832(%rdi), %ymm2
4180 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4181 ; AVX-NEXT: vmovaps 864(%rdi), %ymm1
4182 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4183 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1]
4184 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4185 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
4186 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1]
4187 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4188 ; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5]
4189 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4]
4190 ; AVX-NEXT: vmovaps 800(%rdi), %xmm2
4191 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4192 ; AVX-NEXT: vmovaps 816(%rdi), %xmm1
4193 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4194 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
4195 ; AVX-NEXT: vmovaps 784(%rdi), %xmm2
4196 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4197 ; AVX-NEXT: vmovaps 768(%rdi), %xmm3
4198 ; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4199 ; AVX-NEXT: vunpcklps {{.*#+}} xmm10 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
4200 ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,0]
4201 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4202 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4203 ; AVX-NEXT: vmovaps 64(%rdi), %ymm0
4204 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4205 ; AVX-NEXT: vmovaps 96(%rdi), %ymm1
4206 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4207 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1]
4208 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4209 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
4210 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
4211 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4212 ; AVX-NEXT: vunpcklps {{.*#+}} ymm11 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
4213 ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm11[0,1],ymm10[2,0],ymm11[4,5],ymm10[6,4]
4214 ; AVX-NEXT: vmovaps (%rdi), %xmm2
4215 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4216 ; AVX-NEXT: vmovaps 16(%rdi), %xmm3
4217 ; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4218 ; AVX-NEXT: vmovaps 32(%rdi), %xmm4
4219 ; AVX-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4220 ; AVX-NEXT: vmovaps 48(%rdi), %xmm0
4221 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4222 ; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0]
4223 ; AVX-NEXT: vunpcklps {{.*#+}} xmm15 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
4224 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm15[0,1],xmm0[2,0]
4225 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
4226 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4227 ; AVX-NEXT: vmovaps %ymm8, %ymm10
4228 ; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4229 ; AVX-NEXT: vmovaps %ymm9, %ymm2
4230 ; AVX-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4231 ; AVX-NEXT: vunpcklps {{.*#+}} ymm0 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5]
4232 ; AVX-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4233 ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm14[1,0],ymm7[1,0],ymm14[5,4],ymm7[5,4]
4234 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7]
4235 ; AVX-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4236 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
4237 ; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm12[1],xmm4[1],zero,zero
4238 ; AVX-NEXT: vmovaps %xmm13, %xmm7
4239 ; AVX-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4240 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
4241 ; AVX-NEXT: vunpcklps {{.*#+}} xmm15 = xmm3[0],xmm13[0],xmm3[1],xmm13[1]
4242 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
4243 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4244 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4245 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
4246 ; AVX-NEXT: vunpcklps {{.*#+}} ymm0 = ymm8[0],ymm5[0],ymm8[1],ymm5[1],ymm8[4],ymm5[4],ymm8[5],ymm5[5]
4247 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
4248 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
4249 ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm9[1,0],ymm5[1,0],ymm9[5,4],ymm5[5,4]
4250 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7]
4251 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
4252 ; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm6[1],xmm13[1],zero,zero
4253 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
4254 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
4255 ; AVX-NEXT: vunpcklps {{.*#+}} xmm15 = xmm6[0],xmm11[0],xmm6[1],xmm11[1]
4256 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
4257 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4258 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4259 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4260 ; AVX-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
4261 ; AVX-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
4262 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4263 ; AVX-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
4264 ; AVX-NEXT: # ymm1 = ymm1[1,0],mem[1,0],ymm1[5,4],mem[5,4]
4265 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7]
4266 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4267 ; AVX-NEXT: vinsertps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
4268 ; AVX-NEXT: # xmm1 = mem[0],xmm1[1],zero,zero
4269 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
4270 ; AVX-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
4271 ; AVX-NEXT: # xmm15 = xmm15[0],mem[0],xmm15[1],mem[1]
4272 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
4273 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4274 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4275 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4276 ; AVX-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
4277 ; AVX-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
4278 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4279 ; AVX-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
4280 ; AVX-NEXT: # ymm1 = ymm1[1,0],mem[1,0],ymm1[5,4],mem[5,4]
4281 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7]
4282 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4283 ; AVX-NEXT: vinsertps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
4284 ; AVX-NEXT: # xmm1 = mem[0],xmm1[1],zero,zero
4285 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
4286 ; AVX-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
4287 ; AVX-NEXT: # xmm15 = xmm15[0],mem[0],xmm15[1],mem[1]
4288 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
4289 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4290 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4291 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4292 ; AVX-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
4293 ; AVX-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
4294 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4295 ; AVX-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
4296 ; AVX-NEXT: # ymm1 = ymm1[1,0],mem[1,0],ymm1[5,4],mem[5,4]
4297 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7]
4298 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4299 ; AVX-NEXT: vinsertps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
4300 ; AVX-NEXT: # xmm1 = mem[0],xmm1[1],zero,zero
4301 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
4302 ; AVX-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
4303 ; AVX-NEXT: # xmm15 = xmm15[0],mem[0],xmm15[1],mem[1]
4304 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
4305 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4306 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4307 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4308 ; AVX-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
4309 ; AVX-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
4310 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4311 ; AVX-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
4312 ; AVX-NEXT: # ymm1 = ymm1[1,0],mem[1,0],ymm1[5,4],mem[5,4]
4313 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7]
4314 ; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
4315 ; AVX-NEXT: vinsertps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
4316 ; AVX-NEXT: # xmm1 = mem[0],xmm1[1],zero,zero
4317 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
4318 ; AVX-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
4319 ; AVX-NEXT: # xmm15 = xmm15[0],mem[0],xmm15[1],mem[1]
4320 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
4321 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4322 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4323 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4324 ; AVX-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
4325 ; AVX-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
4326 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4327 ; AVX-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
4328 ; AVX-NEXT: # ymm1 = ymm1[1,0],mem[1,0],ymm1[5,4],mem[5,4]
4329 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7]
4330 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4331 ; AVX-NEXT: vinsertps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
4332 ; AVX-NEXT: # xmm1 = mem[0],xmm1[1],zero,zero
4333 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
4334 ; AVX-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
4335 ; AVX-NEXT: # xmm15 = xmm15[0],mem[0],xmm15[1],mem[1]
4336 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
4337 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4338 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4339 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4340 ; AVX-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
4341 ; AVX-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
4342 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4343 ; AVX-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
4344 ; AVX-NEXT: # ymm1 = ymm1[1,0],mem[1,0],ymm1[5,4],mem[5,4]
4345 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7]
4346 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4347 ; AVX-NEXT: vinsertps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
4348 ; AVX-NEXT: # xmm1 = mem[0],xmm1[1],zero,zero
4349 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
4350 ; AVX-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
4351 ; AVX-NEXT: # xmm15 = xmm15[0],mem[0],xmm15[1],mem[1]
4352 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
4353 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4354 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4355 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm10[1],ymm2[3],ymm10[3]
4356 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
4357 ; AVX-NEXT: vunpckhps {{.*#+}} ymm1 = ymm10[2],ymm14[2],ymm10[3],ymm14[3],ymm10[6],ymm14[6],ymm10[7],ymm14[7]
4358 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4]
4359 ; AVX-NEXT: vunpckhps {{.*#+}} xmm1 = xmm12[2],xmm4[2],xmm12[3],xmm4[3]
4360 ; AVX-NEXT: vinsertps {{.*#+}} xmm15 = zero,zero,xmm3[2],xmm7[2]
4361 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
4362 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4363 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4364 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
4365 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm8[1],ymm7[3],ymm8[3]
4366 ; AVX-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm9[2],ymm5[3],ymm9[3],ymm5[6],ymm9[6],ymm5[7],ymm9[7]
4367 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4]
4368 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
4369 ; AVX-NEXT: vunpckhps {{.*#+}} xmm1 = xmm14[2],xmm13[2],xmm14[3],xmm13[3]
4370 ; AVX-NEXT: vinsertps {{.*#+}} xmm15 = zero,zero,xmm6[2],xmm11[2]
4371 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
4372 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4373 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4374 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
4375 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
4376 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm4[1],ymm8[1],ymm4[3],ymm8[3]
4377 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
4378 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
4379 ; AVX-NEXT: vunpckhps {{.*#+}} ymm1 = ymm11[2],ymm5[2],ymm11[3],ymm5[3],ymm11[6],ymm5[6],ymm11[7],ymm5[7]
4380 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4]
4381 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
4382 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
4383 ; AVX-NEXT: vunpckhps {{.*#+}} xmm1 = xmm13[2],xmm12[2],xmm13[3],xmm12[3]
4384 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
4385 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4386 ; AVX-NEXT: vinsertps {{.*#+}} xmm15 = zero,zero,xmm2[2],xmm3[2]
4387 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
4388 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4389 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4390 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
4391 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
4392 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm9[1],ymm6[3],ymm9[3]
4393 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4394 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
4395 ; AVX-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
4396 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4]
4397 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4398 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
4399 ; AVX-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
4400 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
4401 ; AVX-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
4402 ; AVX-NEXT: # xmm15 = zero,zero,xmm15[2],mem[0]
4403 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
4404 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4405 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4406 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4407 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
4408 ; AVX-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
4409 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4410 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
4411 ; AVX-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
4412 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4]
4413 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4414 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
4415 ; AVX-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
4416 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
4417 ; AVX-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
4418 ; AVX-NEXT: # xmm15 = zero,zero,xmm15[2],mem[0]
4419 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
4420 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4421 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4422 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4423 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
4424 ; AVX-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
4425 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4426 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
4427 ; AVX-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
4428 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4]
4429 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4430 ; AVX-NEXT: vunpckhps (%rsp), %xmm1, %xmm1 # 16-byte Folded Reload
4431 ; AVX-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
4432 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
4433 ; AVX-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
4434 ; AVX-NEXT: # xmm15 = zero,zero,xmm15[2],mem[0]
4435 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
4436 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4437 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4438 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4439 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
4440 ; AVX-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
4441 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4442 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
4443 ; AVX-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
4444 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4]
4445 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4446 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
4447 ; AVX-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
4448 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
4449 ; AVX-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
4450 ; AVX-NEXT: # xmm15 = zero,zero,xmm15[2],mem[0]
4451 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
4452 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4453 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4454 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4455 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
4456 ; AVX-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
4457 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4458 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
4459 ; AVX-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
4460 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4]
4461 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4462 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
4463 ; AVX-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
4464 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
4465 ; AVX-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
4466 ; AVX-NEXT: # xmm15 = zero,zero,xmm15[2],mem[0]
4467 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
4468 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4469 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4470 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4471 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
4472 ; AVX-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
4473 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4474 ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,0],ymm10[3,0],ymm1[7,4],ymm10[7,4]
4475 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7]
4476 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4477 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
4478 ; AVX-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
4479 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
4480 ; AVX-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm15 # 16-byte Folded Reload
4481 ; AVX-NEXT: # xmm15 = xmm10[3,0],mem[3,0]
4482 ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm15[2,0],xmm1[2,3]
4483 ; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4484 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4485 ; AVX-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm7[2],ymm0[3],ymm7[3],ymm0[6],ymm7[6],ymm0[7],ymm7[7]
4486 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4487 ; AVX-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
4488 ; AVX-NEXT: # ymm1 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4]
4489 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7]
4490 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4491 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
4492 ; AVX-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
4493 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
4494 ; AVX-NEXT: vshufps {{.*#+}} xmm15 = xmm7[3,0],xmm14[3,0]
4495 ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm15[2,0],xmm1[2,3]
4496 ; AVX-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4497 ; AVX-NEXT: vunpckhps {{.*#+}} ymm0 = ymm8[2],ymm4[2],ymm8[3],ymm4[3],ymm8[6],ymm4[6],ymm8[7],ymm4[7]
4498 ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm5[3,0],ymm11[3,0],ymm5[7,4],ymm11[7,4]
4499 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7]
4500 ; AVX-NEXT: vunpckhps {{.*#+}} xmm1 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
4501 ; AVX-NEXT: vshufps {{.*#+}} xmm14 = xmm12[3,0],xmm13[3,0]
4502 ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm14[2,0],xmm1[2,3]
4503 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4504 ; AVX-NEXT: vunpckhps {{.*#+}} ymm1 = ymm9[2],ymm6[2],ymm9[3],ymm6[3],ymm9[6],ymm6[6],ymm9[7],ymm6[7]
4505 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
4506 ; AVX-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm12 # 32-byte Folded Reload
4507 ; AVX-NEXT: # ymm12 = ymm2[3,0],mem[3,0],ymm2[7,4],mem[7,4]
4508 ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm12[2,0],ymm1[2,3],ymm12[6,4],ymm1[6,7]
4509 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4510 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm12 # 16-byte Folded Reload
4511 ; AVX-NEXT: # xmm12 = xmm2[2],mem[2],xmm2[3],mem[3]
4512 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4513 ; AVX-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm13 # 16-byte Folded Reload
4514 ; AVX-NEXT: # xmm13 = xmm2[3,0],mem[3,0]
4515 ; AVX-NEXT: vshufps {{.*#+}} xmm12 = xmm13[2,0],xmm12[2,3]
4516 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
4517 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
4518 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload
4519 ; AVX-NEXT: # ymm6 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7]
4520 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
4521 ; AVX-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm9 # 32-byte Folded Reload
4522 ; AVX-NEXT: # ymm9 = ymm2[3,0],mem[3,0],ymm2[7,4],mem[7,4]
4523 ; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm9[2,0],ymm6[2,3],ymm9[6,4],ymm6[6,7]
4524 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4525 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm9 # 16-byte Folded Reload
4526 ; AVX-NEXT: # xmm9 = xmm2[2],mem[2],xmm2[3],mem[3]
4527 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4528 ; AVX-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm12 # 16-byte Folded Reload
4529 ; AVX-NEXT: # xmm12 = xmm2[3,0],mem[3,0]
4530 ; AVX-NEXT: vshufps {{.*#+}} xmm9 = xmm12[2,0],xmm9[2,3]
4531 ; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7]
4532 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
4533 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload
4534 ; AVX-NEXT: # ymm4 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7]
4535 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
4536 ; AVX-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm8 # 32-byte Folded Reload
4537 ; AVX-NEXT: # ymm8 = ymm2[3,0],mem[3,0],ymm2[7,4],mem[7,4]
4538 ; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm8[2,0],ymm4[2,3],ymm8[6,4],ymm4[6,7]
4539 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4540 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm8 # 16-byte Folded Reload
4541 ; AVX-NEXT: # xmm8 = xmm2[2],mem[2],xmm2[3],mem[3]
4542 ; AVX-NEXT: vmovaps (%rsp), %xmm2 # 16-byte Reload
4543 ; AVX-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm9 # 16-byte Folded Reload
4544 ; AVX-NEXT: # xmm9 = xmm2[3,0],mem[3,0]
4545 ; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm9[2,0],xmm8[2,3]
4546 ; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7]
4547 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
4548 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
4549 ; AVX-NEXT: # ymm2 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7]
4550 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
4551 ; AVX-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm7 # 32-byte Folded Reload
4552 ; AVX-NEXT: # ymm7 = ymm3[3,0],mem[3,0],ymm3[7,4],mem[7,4]
4553 ; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm7[2,0],ymm2[2,3],ymm7[6,4],ymm2[6,7]
4554 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
4555 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm7 # 16-byte Folded Reload
4556 ; AVX-NEXT: # xmm7 = xmm3[2],mem[2],xmm3[3],mem[3]
4557 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
4558 ; AVX-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm8 # 16-byte Folded Reload
4559 ; AVX-NEXT: # xmm8 = xmm3[3,0],mem[3,0]
4560 ; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm8[2,0],xmm7[2,3]
4561 ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7]
4562 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
4563 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
4564 ; AVX-NEXT: # ymm3 = ymm3[2],mem[2],ymm3[3],mem[3],ymm3[6],mem[6],ymm3[7],mem[7]
4565 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
4566 ; AVX-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
4567 ; AVX-NEXT: # ymm5 = ymm5[3,0],mem[3,0],ymm5[7,4],mem[7,4]
4568 ; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm5[2,0],ymm3[2,3],ymm5[6,4],ymm3[6,7]
4569 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
4570 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
4571 ; AVX-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3]
4572 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
4573 ; AVX-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload
4574 ; AVX-NEXT: # xmm7 = xmm7[3,0],mem[3,0]
4575 ; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm7[2,0],xmm5[2,3]
4576 ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7]
4577 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
4578 ; AVX-NEXT: vmovaps %ymm5, 192(%rsi)
4579 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
4580 ; AVX-NEXT: vmovaps %ymm5, 128(%rsi)
4581 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
4582 ; AVX-NEXT: vmovaps %ymm5, 64(%rsi)
4583 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
4584 ; AVX-NEXT: vmovaps %ymm5, (%rsi)
4585 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
4586 ; AVX-NEXT: vmovaps %ymm5, 224(%rsi)
4587 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
4588 ; AVX-NEXT: vmovaps %ymm5, 160(%rsi)
4589 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
4590 ; AVX-NEXT: vmovaps %ymm5, 96(%rsi)
4591 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
4592 ; AVX-NEXT: vmovaps %ymm5, 32(%rsi)
4593 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
4594 ; AVX-NEXT: vmovaps %ymm5, 192(%rdx)
4595 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
4596 ; AVX-NEXT: vmovaps %ymm5, 128(%rdx)
4597 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
4598 ; AVX-NEXT: vmovaps %ymm5, 64(%rdx)
4599 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
4600 ; AVX-NEXT: vmovaps %ymm5, (%rdx)
4601 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
4602 ; AVX-NEXT: vmovaps %ymm5, 224(%rdx)
4603 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
4604 ; AVX-NEXT: vmovaps %ymm5, 160(%rdx)
4605 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
4606 ; AVX-NEXT: vmovaps %ymm5, 96(%rdx)
4607 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
4608 ; AVX-NEXT: vmovaps %ymm5, 32(%rdx)
4609 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
4610 ; AVX-NEXT: vmovaps %ymm5, 192(%rcx)
4611 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
4612 ; AVX-NEXT: vmovaps %ymm5, 128(%rcx)
4613 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
4614 ; AVX-NEXT: vmovaps %ymm5, 64(%rcx)
4615 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
4616 ; AVX-NEXT: vmovaps %ymm5, (%rcx)
4617 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
4618 ; AVX-NEXT: vmovaps %ymm5, 224(%rcx)
4619 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
4620 ; AVX-NEXT: vmovaps %ymm5, 160(%rcx)
4621 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
4622 ; AVX-NEXT: vmovaps %ymm5, 96(%rcx)
4623 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
4624 ; AVX-NEXT: vmovaps %ymm5, 32(%rcx)
4625 ; AVX-NEXT: vmovaps %ymm2, 192(%r8)
4626 ; AVX-NEXT: vmovaps %ymm4, 128(%r8)
4627 ; AVX-NEXT: vmovaps %ymm6, 64(%r8)
4628 ; AVX-NEXT: vmovaps %ymm3, (%r8)
4629 ; AVX-NEXT: vmovaps %ymm1, 224(%r8)
4630 ; AVX-NEXT: vmovaps %ymm0, 160(%r8)
4631 ; AVX-NEXT: vmovaps %ymm15, 96(%r8)
4632 ; AVX-NEXT: vmovaps %ymm10, 32(%r8)
4633 ; AVX-NEXT: addq $2184, %rsp # imm = 0x888
4634 ; AVX-NEXT: vzeroupper
4635 ; AVX-NEXT: retq
4636 ;
4637 ; AVX2-LABEL: load_i32_stride4_vf64:
4638 ; AVX2: # %bb.0:
4639 ; AVX2-NEXT: subq $1944, %rsp # imm = 0x798
4640 ; AVX2-NEXT: vmovaps 704(%rdi), %ymm13
4641 ; AVX2-NEXT: vmovaps 416(%rdi), %ymm8
4642 ; AVX2-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4643 ; AVX2-NEXT: vmovaps 448(%rdi), %ymm4
4644 ; AVX2-NEXT: vmovaps 480(%rdi), %ymm15
4645 ; AVX2-NEXT: vmovaps 160(%rdi), %ymm9
4646 ; AVX2-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4647 ; AVX2-NEXT: vmovaps 192(%rdi), %ymm10
4648 ; AVX2-NEXT: vmovaps 224(%rdi), %ymm14
4649 ; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [0,4,0,4,0,4,0,4]
4650 ; AVX2-NEXT: vpermps %ymm14, %ymm2, %ymm0
4651 ; AVX2-NEXT: vpermps %ymm10, %ymm2, %ymm1
4652 ; AVX2-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4653 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
4654 ; AVX2-NEXT: vmovaps 144(%rdi), %xmm3
4655 ; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4656 ; AVX2-NEXT: vmovaps 128(%rdi), %xmm1
4657 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4658 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
4659 ; AVX2-NEXT: vpermps %ymm9, %ymm2, %ymm3
4660 ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
4661 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4662 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4663 ; AVX2-NEXT: vpermps %ymm15, %ymm2, %ymm0
4664 ; AVX2-NEXT: vpermps %ymm4, %ymm2, %ymm1
4665 ; AVX2-NEXT: vmovaps %ymm4, %ymm9
4666 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
4667 ; AVX2-NEXT: vmovaps 400(%rdi), %xmm3
4668 ; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4669 ; AVX2-NEXT: vmovaps 384(%rdi), %xmm1
4670 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4671 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
4672 ; AVX2-NEXT: vpermps %ymm8, %ymm2, %ymm3
4673 ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
4674 ; AVX2-NEXT: vmovaps 736(%rdi), %ymm11
4675 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4676 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4677 ; AVX2-NEXT: vpermps %ymm11, %ymm2, %ymm0
4678 ; AVX2-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4679 ; AVX2-NEXT: vpermps %ymm13, %ymm2, %ymm1
4680 ; AVX2-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4681 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
4682 ; AVX2-NEXT: vmovaps 672(%rdi), %ymm4
4683 ; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4684 ; AVX2-NEXT: vmovaps 656(%rdi), %xmm3
4685 ; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4686 ; AVX2-NEXT: vmovaps 640(%rdi), %xmm1
4687 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4688 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
4689 ; AVX2-NEXT: vpermps %ymm4, %ymm2, %ymm3
4690 ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
4691 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4692 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4693 ; AVX2-NEXT: vmovaps 960(%rdi), %ymm3
4694 ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4695 ; AVX2-NEXT: vmovaps 992(%rdi), %ymm8
4696 ; AVX2-NEXT: vpermps %ymm8, %ymm2, %ymm0
4697 ; AVX2-NEXT: vpermps %ymm3, %ymm2, %ymm1
4698 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
4699 ; AVX2-NEXT: vmovaps 928(%rdi), %ymm4
4700 ; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4701 ; AVX2-NEXT: vmovaps 912(%rdi), %xmm3
4702 ; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4703 ; AVX2-NEXT: vmovaps 896(%rdi), %xmm1
4704 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4705 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
4706 ; AVX2-NEXT: vpermps %ymm4, %ymm2, %ymm3
4707 ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
4708 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4709 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4710 ; AVX2-NEXT: vmovaps 320(%rdi), %ymm3
4711 ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4712 ; AVX2-NEXT: vmovaps 352(%rdi), %ymm0
4713 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4714 ; AVX2-NEXT: vpermps %ymm0, %ymm2, %ymm0
4715 ; AVX2-NEXT: vpermps %ymm3, %ymm2, %ymm1
4716 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
4717 ; AVX2-NEXT: vmovaps 288(%rdi), %ymm4
4718 ; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4719 ; AVX2-NEXT: vmovaps 272(%rdi), %xmm3
4720 ; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4721 ; AVX2-NEXT: vmovaps 256(%rdi), %xmm1
4722 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4723 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
4724 ; AVX2-NEXT: vpermps %ymm4, %ymm2, %ymm3
4725 ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
4726 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4727 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4728 ; AVX2-NEXT: vmovaps 576(%rdi), %ymm3
4729 ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4730 ; AVX2-NEXT: vmovaps 608(%rdi), %ymm0
4731 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4732 ; AVX2-NEXT: vpermps %ymm0, %ymm2, %ymm0
4733 ; AVX2-NEXT: vpermps %ymm3, %ymm2, %ymm1
4734 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
4735 ; AVX2-NEXT: vmovaps 544(%rdi), %ymm4
4736 ; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4737 ; AVX2-NEXT: vmovaps 528(%rdi), %xmm3
4738 ; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4739 ; AVX2-NEXT: vmovaps 512(%rdi), %xmm1
4740 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4741 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
4742 ; AVX2-NEXT: vpermps %ymm4, %ymm2, %ymm3
4743 ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
4744 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4745 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4746 ; AVX2-NEXT: vmovaps 832(%rdi), %ymm3
4747 ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4748 ; AVX2-NEXT: vmovaps 864(%rdi), %ymm0
4749 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4750 ; AVX2-NEXT: vpermps %ymm0, %ymm2, %ymm0
4751 ; AVX2-NEXT: vpermps %ymm3, %ymm2, %ymm1
4752 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
4753 ; AVX2-NEXT: vmovaps 800(%rdi), %ymm4
4754 ; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4755 ; AVX2-NEXT: vmovaps 784(%rdi), %xmm3
4756 ; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4757 ; AVX2-NEXT: vmovaps 768(%rdi), %xmm1
4758 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4759 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
4760 ; AVX2-NEXT: vpermps %ymm4, %ymm2, %ymm3
4761 ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
4762 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4763 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4764 ; AVX2-NEXT: vmovaps 64(%rdi), %ymm1
4765 ; AVX2-NEXT: vmovaps 96(%rdi), %ymm7
4766 ; AVX2-NEXT: vpermps %ymm7, %ymm2, %ymm0
4767 ; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4768 ; AVX2-NEXT: vpermps %ymm1, %ymm2, %ymm4
4769 ; AVX2-NEXT: vmovaps %ymm1, %ymm3
4770 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4771 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
4772 ; AVX2-NEXT: vmovaps 32(%rdi), %ymm1
4773 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4774 ; AVX2-NEXT: vpermps %ymm1, %ymm2, %ymm2
4775 ; AVX2-NEXT: vmovaps (%rdi), %xmm1
4776 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4777 ; AVX2-NEXT: vmovaps 16(%rdi), %xmm4
4778 ; AVX2-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4779 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm4 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
4780 ; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3]
4781 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
4782 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4783 ; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [1,5,1,5,1,5,1,5]
4784 ; AVX2-NEXT: vpermps %ymm14, %ymm2, %ymm0
4785 ; AVX2-NEXT: vpermps %ymm10, %ymm2, %ymm4
4786 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
4787 ; AVX2-NEXT: vmovaps 128(%rdi), %ymm1
4788 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4789 ; AVX2-NEXT: vmovaps 176(%rdi), %xmm5
4790 ; AVX2-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4791 ; AVX2-NEXT: vmovaps 160(%rdi), %xmm4
4792 ; AVX2-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4793 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
4794 ; AVX2-NEXT: vpermps %ymm1, %ymm2, %ymm5
4795 ; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
4796 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
4797 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4798 ; AVX2-NEXT: vpermps %ymm15, %ymm2, %ymm0
4799 ; AVX2-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4800 ; AVX2-NEXT: vpermps %ymm9, %ymm2, %ymm4
4801 ; AVX2-NEXT: vmovaps %ymm9, %ymm10
4802 ; AVX2-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4803 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
4804 ; AVX2-NEXT: vmovaps 384(%rdi), %ymm1
4805 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4806 ; AVX2-NEXT: vmovaps 432(%rdi), %xmm5
4807 ; AVX2-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4808 ; AVX2-NEXT: vmovaps 416(%rdi), %xmm4
4809 ; AVX2-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4810 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
4811 ; AVX2-NEXT: vpermps %ymm1, %ymm2, %ymm5
4812 ; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
4813 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
4814 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4815 ; AVX2-NEXT: vpermps %ymm11, %ymm2, %ymm0
4816 ; AVX2-NEXT: vpermps %ymm13, %ymm2, %ymm4
4817 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
4818 ; AVX2-NEXT: vmovaps 640(%rdi), %ymm1
4819 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4820 ; AVX2-NEXT: vmovaps 688(%rdi), %xmm5
4821 ; AVX2-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4822 ; AVX2-NEXT: vmovaps 672(%rdi), %xmm4
4823 ; AVX2-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4824 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
4825 ; AVX2-NEXT: vpermps %ymm1, %ymm2, %ymm5
4826 ; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
4827 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
4828 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4829 ; AVX2-NEXT: vpermps %ymm8, %ymm2, %ymm0
4830 ; AVX2-NEXT: vmovaps %ymm8, %ymm11
4831 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
4832 ; AVX2-NEXT: vpermps %ymm8, %ymm2, %ymm4
4833 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
4834 ; AVX2-NEXT: vmovaps 896(%rdi), %ymm1
4835 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4836 ; AVX2-NEXT: vmovaps 944(%rdi), %xmm5
4837 ; AVX2-NEXT: vmovaps %xmm5, (%rsp) # 16-byte Spill
4838 ; AVX2-NEXT: vmovaps 928(%rdi), %xmm4
4839 ; AVX2-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4840 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
4841 ; AVX2-NEXT: vpermps %ymm1, %ymm2, %ymm5
4842 ; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
4843 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
4844 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4845 ; AVX2-NEXT: vpermps %ymm7, %ymm2, %ymm0
4846 ; AVX2-NEXT: vpermps %ymm3, %ymm2, %ymm4
4847 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
4848 ; AVX2-NEXT: vmovaps (%rdi), %ymm1
4849 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4850 ; AVX2-NEXT: vmovaps 32(%rdi), %xmm3
4851 ; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4852 ; AVX2-NEXT: vmovaps 48(%rdi), %xmm4
4853 ; AVX2-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4854 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm4 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
4855 ; AVX2-NEXT: vpermps %ymm1, %ymm2, %ymm5
4856 ; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
4857 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
4858 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4859 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
4860 ; AVX2-NEXT: vpermps %ymm6, %ymm2, %ymm0
4861 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
4862 ; AVX2-NEXT: vpermps %ymm7, %ymm2, %ymm4
4863 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
4864 ; AVX2-NEXT: vmovaps 256(%rdi), %ymm1
4865 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4866 ; AVX2-NEXT: vmovaps 304(%rdi), %xmm3
4867 ; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4868 ; AVX2-NEXT: vmovaps 288(%rdi), %xmm4
4869 ; AVX2-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4870 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
4871 ; AVX2-NEXT: vpermps %ymm1, %ymm2, %ymm5
4872 ; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
4873 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
4874 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4875 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
4876 ; AVX2-NEXT: vpermps %ymm13, %ymm2, %ymm0
4877 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
4878 ; AVX2-NEXT: vpermps %ymm12, %ymm2, %ymm4
4879 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
4880 ; AVX2-NEXT: vmovaps 512(%rdi), %ymm1
4881 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4882 ; AVX2-NEXT: vmovaps 560(%rdi), %xmm3
4883 ; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4884 ; AVX2-NEXT: vmovaps 544(%rdi), %xmm4
4885 ; AVX2-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4886 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
4887 ; AVX2-NEXT: vpermps %ymm1, %ymm2, %ymm5
4888 ; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
4889 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
4890 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4891 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
4892 ; AVX2-NEXT: vpermps %ymm3, %ymm2, %ymm0
4893 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4894 ; AVX2-NEXT: vpermps %ymm1, %ymm2, %ymm4
4895 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
4896 ; AVX2-NEXT: vmovaps 768(%rdi), %ymm4
4897 ; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4898 ; AVX2-NEXT: vpermps %ymm4, %ymm2, %ymm2
4899 ; AVX2-NEXT: vmovaps 816(%rdi), %xmm5
4900 ; AVX2-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4901 ; AVX2-NEXT: vmovaps 800(%rdi), %xmm4
4902 ; AVX2-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4903 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
4904 ; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3]
4905 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
4906 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4907 ; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm0 = [2,6,2,6,2,6,2,6]
4908 ; AVX2-NEXT: vpermps %ymm14, %ymm0, %ymm2
4909 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
4910 ; AVX2-NEXT: vpermps %ymm5, %ymm0, %ymm4
4911 ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
4912 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
4913 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
4914 ; AVX2-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3]
4915 ; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
4916 ; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm9[2,3]
4917 ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
4918 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4919 ; AVX2-NEXT: vpermps %ymm15, %ymm0, %ymm2
4920 ; AVX2-NEXT: vpermps %ymm10, %ymm0, %ymm4
4921 ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
4922 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
4923 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
4924 ; AVX2-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3]
4925 ; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
4926 ; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm9[2,3]
4927 ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
4928 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4929 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
4930 ; AVX2-NEXT: vpermps %ymm15, %ymm0, %ymm2
4931 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
4932 ; AVX2-NEXT: vpermps %ymm10, %ymm0, %ymm4
4933 ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
4934 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
4935 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
4936 ; AVX2-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3]
4937 ; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
4938 ; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm9[2,3]
4939 ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
4940 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4941 ; AVX2-NEXT: vpermps %ymm11, %ymm0, %ymm2
4942 ; AVX2-NEXT: vpermps %ymm8, %ymm0, %ymm4
4943 ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
4944 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
4945 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
4946 ; AVX2-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3]
4947 ; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
4948 ; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm9[2,3]
4949 ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
4950 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4951 ; AVX2-NEXT: vpermps %ymm6, %ymm0, %ymm2
4952 ; AVX2-NEXT: vpermps %ymm7, %ymm0, %ymm4
4953 ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
4954 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
4955 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
4956 ; AVX2-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3]
4957 ; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
4958 ; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm9[2,3]
4959 ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
4960 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4961 ; AVX2-NEXT: vpermps %ymm13, %ymm0, %ymm2
4962 ; AVX2-NEXT: vpermps %ymm12, %ymm0, %ymm4
4963 ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
4964 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
4965 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
4966 ; AVX2-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3]
4967 ; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
4968 ; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm9[2,3]
4969 ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
4970 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4971 ; AVX2-NEXT: vpermps %ymm3, %ymm0, %ymm2
4972 ; AVX2-NEXT: vpermps %ymm1, %ymm0, %ymm4
4973 ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
4974 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4975 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload
4976 ; AVX2-NEXT: # xmm4 = xmm1[2],mem[2],xmm1[3],mem[3]
4977 ; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
4978 ; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm9[2,3]
4979 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm2[4,5,6,7]
4980 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4981 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4982 ; AVX2-NEXT: vpermps %ymm1, %ymm0, %ymm2
4983 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
4984 ; AVX2-NEXT: vpermps %ymm3, %ymm0, %ymm4
4985 ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
4986 ; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
4987 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
4988 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
4989 ; AVX2-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3]
4990 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3]
4991 ; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm2[4,5,6,7]
4992 ; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm0 = [3,7,3,7,3,7,3,7]
4993 ; AVX2-NEXT: vpermps %ymm14, %ymm0, %ymm2
4994 ; AVX2-NEXT: vpermps %ymm5, %ymm0, %ymm4
4995 ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
4996 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
4997 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
4998 ; AVX2-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3]
4999 ; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
5000 ; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm14[0,1],xmm4[2,3]
5001 ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
5002 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5003 ; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
5004 ; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
5005 ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
5006 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
5007 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
5008 ; AVX2-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3]
5009 ; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
5010 ; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm14[0,1],xmm4[2,3]
5011 ; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm2[4,5,6,7]
5012 ; AVX2-NEXT: vpermps %ymm15, %ymm0, %ymm2
5013 ; AVX2-NEXT: vpermps %ymm10, %ymm0, %ymm14
5014 ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0,1,2,3,4,5],ymm2[6,7]
5015 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
5016 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm14 # 16-byte Folded Reload
5017 ; AVX2-NEXT: # xmm14 = xmm5[2],mem[2],xmm5[3],mem[3]
5018 ; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
5019 ; AVX2-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3]
5020 ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm2[4,5,6,7]
5021 ; AVX2-NEXT: vpermps %ymm11, %ymm0, %ymm11
5022 ; AVX2-NEXT: vpermps %ymm8, %ymm0, %ymm14
5023 ; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm14[0,1,2,3,4,5],ymm11[6,7]
5024 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
5025 ; AVX2-NEXT: vunpckhps (%rsp), %xmm5, %xmm14 # 16-byte Folded Reload
5026 ; AVX2-NEXT: # xmm14 = xmm5[2],mem[2],xmm5[3],mem[3]
5027 ; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
5028 ; AVX2-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3]
5029 ; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm14[0,1,2,3],ymm11[4,5,6,7]
5030 ; AVX2-NEXT: vpermps %ymm1, %ymm0, %ymm1
5031 ; AVX2-NEXT: vpermps %ymm3, %ymm0, %ymm3
5032 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
5033 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
5034 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
5035 ; AVX2-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3]
5036 ; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
5037 ; AVX2-NEXT: vblendps {{.*#+}} xmm3 = xmm14[0,1],xmm3[2,3]
5038 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
5039 ; AVX2-NEXT: vpermps %ymm6, %ymm0, %ymm3
5040 ; AVX2-NEXT: vpermps %ymm7, %ymm0, %ymm14
5041 ; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm14[0,1,2,3,4,5],ymm3[6,7]
5042 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
5043 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm14 # 16-byte Folded Reload
5044 ; AVX2-NEXT: # xmm14 = xmm5[2],mem[2],xmm5[3],mem[3]
5045 ; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
5046 ; AVX2-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3]
5047 ; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm14[0,1,2,3],ymm3[4,5,6,7]
5048 ; AVX2-NEXT: vpermps %ymm13, %ymm0, %ymm10
5049 ; AVX2-NEXT: vpermps %ymm12, %ymm0, %ymm14
5050 ; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm14[0,1,2,3,4,5],ymm10[6,7]
5051 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
5052 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm13 # 16-byte Folded Reload
5053 ; AVX2-NEXT: # xmm13 = xmm5[2],mem[2],xmm5[3],mem[3]
5054 ; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
5055 ; AVX2-NEXT: vblendps {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3]
5056 ; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0,1,2,3],ymm10[4,5,6,7]
5057 ; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload
5058 ; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
5059 ; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7]
5060 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
5061 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
5062 ; AVX2-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3]
5063 ; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
5064 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3]
5065 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7]
5066 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
5067 ; AVX2-NEXT: vmovaps %ymm5, 192(%rsi)
5068 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
5069 ; AVX2-NEXT: vmovaps %ymm5, 128(%rsi)
5070 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
5071 ; AVX2-NEXT: vmovaps %ymm5, 64(%rsi)
5072 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
5073 ; AVX2-NEXT: vmovaps %ymm5, (%rsi)
5074 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
5075 ; AVX2-NEXT: vmovaps %ymm5, 224(%rsi)
5076 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
5077 ; AVX2-NEXT: vmovaps %ymm5, 160(%rsi)
5078 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
5079 ; AVX2-NEXT: vmovaps %ymm5, 96(%rsi)
5080 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
5081 ; AVX2-NEXT: vmovaps %ymm5, 32(%rsi)
5082 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
5083 ; AVX2-NEXT: vmovaps %ymm5, 192(%rdx)
5084 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
5085 ; AVX2-NEXT: vmovaps %ymm5, 128(%rdx)
5086 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
5087 ; AVX2-NEXT: vmovaps %ymm5, 64(%rdx)
5088 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
5089 ; AVX2-NEXT: vmovaps %ymm5, (%rdx)
5090 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
5091 ; AVX2-NEXT: vmovaps %ymm5, 224(%rdx)
5092 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
5093 ; AVX2-NEXT: vmovaps %ymm5, 160(%rdx)
5094 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
5095 ; AVX2-NEXT: vmovaps %ymm5, 96(%rdx)
5096 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
5097 ; AVX2-NEXT: vmovaps %ymm5, 32(%rdx)
5098 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
5099 ; AVX2-NEXT: vmovaps %ymm5, 192(%rcx)
5100 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
5101 ; AVX2-NEXT: vmovaps %ymm5, 128(%rcx)
5102 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
5103 ; AVX2-NEXT: vmovaps %ymm5, 64(%rcx)
5104 ; AVX2-NEXT: vmovaps %ymm9, (%rcx)
5105 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
5106 ; AVX2-NEXT: vmovaps %ymm5, 224(%rcx)
5107 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
5108 ; AVX2-NEXT: vmovaps %ymm5, 160(%rcx)
5109 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
5110 ; AVX2-NEXT: vmovaps %ymm5, 96(%rcx)
5111 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
5112 ; AVX2-NEXT: vmovaps %ymm5, 32(%rcx)
5113 ; AVX2-NEXT: vmovaps %ymm0, 192(%r8)
5114 ; AVX2-NEXT: vmovaps %ymm10, 128(%r8)
5115 ; AVX2-NEXT: vmovaps %ymm3, 64(%r8)
5116 ; AVX2-NEXT: vmovaps %ymm1, (%r8)
5117 ; AVX2-NEXT: vmovaps %ymm11, 224(%r8)
5118 ; AVX2-NEXT: vmovaps %ymm2, 160(%r8)
5119 ; AVX2-NEXT: vmovaps %ymm4, 96(%r8)
5120 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5121 ; AVX2-NEXT: vmovaps %ymm0, 32(%r8)
5122 ; AVX2-NEXT: addq $1944, %rsp # imm = 0x798
5123 ; AVX2-NEXT: vzeroupper
5124 ; AVX2-NEXT: retq
5125 ;
5126 ; AVX2-FP-LABEL: load_i32_stride4_vf64:
5127 ; AVX2-FP: # %bb.0:
5128 ; AVX2-FP-NEXT: subq $1944, %rsp # imm = 0x798
5129 ; AVX2-FP-NEXT: vmovaps 704(%rdi), %ymm13
5130 ; AVX2-FP-NEXT: vmovaps 416(%rdi), %ymm8
5131 ; AVX2-FP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5132 ; AVX2-FP-NEXT: vmovaps 448(%rdi), %ymm4
5133 ; AVX2-FP-NEXT: vmovaps 480(%rdi), %ymm15
5134 ; AVX2-FP-NEXT: vmovaps 160(%rdi), %ymm9
5135 ; AVX2-FP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5136 ; AVX2-FP-NEXT: vmovaps 192(%rdi), %ymm10
5137 ; AVX2-FP-NEXT: vmovaps 224(%rdi), %ymm14
5138 ; AVX2-FP-NEXT: vbroadcastsd {{.*#+}} ymm2 = [0,4,0,4,0,4,0,4]
5139 ; AVX2-FP-NEXT: vpermps %ymm14, %ymm2, %ymm0
5140 ; AVX2-FP-NEXT: vpermps %ymm10, %ymm2, %ymm1
5141 ; AVX2-FP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5142 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
5143 ; AVX2-FP-NEXT: vmovaps 144(%rdi), %xmm3
5144 ; AVX2-FP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5145 ; AVX2-FP-NEXT: vmovaps 128(%rdi), %xmm1
5146 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5147 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
5148 ; AVX2-FP-NEXT: vpermps %ymm9, %ymm2, %ymm3
5149 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
5150 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5151 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5152 ; AVX2-FP-NEXT: vpermps %ymm15, %ymm2, %ymm0
5153 ; AVX2-FP-NEXT: vpermps %ymm4, %ymm2, %ymm1
5154 ; AVX2-FP-NEXT: vmovaps %ymm4, %ymm9
5155 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
5156 ; AVX2-FP-NEXT: vmovaps 400(%rdi), %xmm3
5157 ; AVX2-FP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5158 ; AVX2-FP-NEXT: vmovaps 384(%rdi), %xmm1
5159 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5160 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
5161 ; AVX2-FP-NEXT: vpermps %ymm8, %ymm2, %ymm3
5162 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
5163 ; AVX2-FP-NEXT: vmovaps 736(%rdi), %ymm11
5164 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5165 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5166 ; AVX2-FP-NEXT: vpermps %ymm11, %ymm2, %ymm0
5167 ; AVX2-FP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5168 ; AVX2-FP-NEXT: vpermps %ymm13, %ymm2, %ymm1
5169 ; AVX2-FP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5170 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
5171 ; AVX2-FP-NEXT: vmovaps 672(%rdi), %ymm4
5172 ; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5173 ; AVX2-FP-NEXT: vmovaps 656(%rdi), %xmm3
5174 ; AVX2-FP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5175 ; AVX2-FP-NEXT: vmovaps 640(%rdi), %xmm1
5176 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5177 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
5178 ; AVX2-FP-NEXT: vpermps %ymm4, %ymm2, %ymm3
5179 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
5180 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5181 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5182 ; AVX2-FP-NEXT: vmovaps 960(%rdi), %ymm3
5183 ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5184 ; AVX2-FP-NEXT: vmovaps 992(%rdi), %ymm8
5185 ; AVX2-FP-NEXT: vpermps %ymm8, %ymm2, %ymm0
5186 ; AVX2-FP-NEXT: vpermps %ymm3, %ymm2, %ymm1
5187 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
5188 ; AVX2-FP-NEXT: vmovaps 928(%rdi), %ymm4
5189 ; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5190 ; AVX2-FP-NEXT: vmovaps 912(%rdi), %xmm3
5191 ; AVX2-FP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5192 ; AVX2-FP-NEXT: vmovaps 896(%rdi), %xmm1
5193 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5194 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
5195 ; AVX2-FP-NEXT: vpermps %ymm4, %ymm2, %ymm3
5196 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
5197 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5198 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5199 ; AVX2-FP-NEXT: vmovaps 320(%rdi), %ymm3
5200 ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5201 ; AVX2-FP-NEXT: vmovaps 352(%rdi), %ymm0
5202 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5203 ; AVX2-FP-NEXT: vpermps %ymm0, %ymm2, %ymm0
5204 ; AVX2-FP-NEXT: vpermps %ymm3, %ymm2, %ymm1
5205 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
5206 ; AVX2-FP-NEXT: vmovaps 288(%rdi), %ymm4
5207 ; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5208 ; AVX2-FP-NEXT: vmovaps 272(%rdi), %xmm3
5209 ; AVX2-FP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5210 ; AVX2-FP-NEXT: vmovaps 256(%rdi), %xmm1
5211 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5212 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
5213 ; AVX2-FP-NEXT: vpermps %ymm4, %ymm2, %ymm3
5214 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
5215 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5216 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5217 ; AVX2-FP-NEXT: vmovaps 576(%rdi), %ymm3
5218 ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5219 ; AVX2-FP-NEXT: vmovaps 608(%rdi), %ymm0
5220 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5221 ; AVX2-FP-NEXT: vpermps %ymm0, %ymm2, %ymm0
5222 ; AVX2-FP-NEXT: vpermps %ymm3, %ymm2, %ymm1
5223 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
5224 ; AVX2-FP-NEXT: vmovaps 544(%rdi), %ymm4
5225 ; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5226 ; AVX2-FP-NEXT: vmovaps 528(%rdi), %xmm3
5227 ; AVX2-FP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5228 ; AVX2-FP-NEXT: vmovaps 512(%rdi), %xmm1
5229 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5230 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
5231 ; AVX2-FP-NEXT: vpermps %ymm4, %ymm2, %ymm3
5232 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
5233 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5234 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5235 ; AVX2-FP-NEXT: vmovaps 832(%rdi), %ymm3
5236 ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5237 ; AVX2-FP-NEXT: vmovaps 864(%rdi), %ymm0
5238 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5239 ; AVX2-FP-NEXT: vpermps %ymm0, %ymm2, %ymm0
5240 ; AVX2-FP-NEXT: vpermps %ymm3, %ymm2, %ymm1
5241 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
5242 ; AVX2-FP-NEXT: vmovaps 800(%rdi), %ymm4
5243 ; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5244 ; AVX2-FP-NEXT: vmovaps 784(%rdi), %xmm3
5245 ; AVX2-FP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5246 ; AVX2-FP-NEXT: vmovaps 768(%rdi), %xmm1
5247 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5248 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
5249 ; AVX2-FP-NEXT: vpermps %ymm4, %ymm2, %ymm3
5250 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
5251 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5252 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5253 ; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm1
5254 ; AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm7
5255 ; AVX2-FP-NEXT: vpermps %ymm7, %ymm2, %ymm0
5256 ; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5257 ; AVX2-FP-NEXT: vpermps %ymm1, %ymm2, %ymm4
5258 ; AVX2-FP-NEXT: vmovaps %ymm1, %ymm3
5259 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5260 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
5261 ; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm1
5262 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5263 ; AVX2-FP-NEXT: vpermps %ymm1, %ymm2, %ymm2
5264 ; AVX2-FP-NEXT: vmovaps (%rdi), %xmm1
5265 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5266 ; AVX2-FP-NEXT: vmovaps 16(%rdi), %xmm4
5267 ; AVX2-FP-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5268 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm4 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
5269 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3]
5270 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
5271 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5272 ; AVX2-FP-NEXT: vbroadcastsd {{.*#+}} ymm2 = [1,5,1,5,1,5,1,5]
5273 ; AVX2-FP-NEXT: vpermps %ymm14, %ymm2, %ymm0
5274 ; AVX2-FP-NEXT: vpermps %ymm10, %ymm2, %ymm4
5275 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
5276 ; AVX2-FP-NEXT: vmovaps 128(%rdi), %ymm1
5277 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5278 ; AVX2-FP-NEXT: vmovaps 176(%rdi), %xmm5
5279 ; AVX2-FP-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5280 ; AVX2-FP-NEXT: vmovaps 160(%rdi), %xmm4
5281 ; AVX2-FP-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5282 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
5283 ; AVX2-FP-NEXT: vpermps %ymm1, %ymm2, %ymm5
5284 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
5285 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
5286 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5287 ; AVX2-FP-NEXT: vpermps %ymm15, %ymm2, %ymm0
5288 ; AVX2-FP-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5289 ; AVX2-FP-NEXT: vpermps %ymm9, %ymm2, %ymm4
5290 ; AVX2-FP-NEXT: vmovaps %ymm9, %ymm10
5291 ; AVX2-FP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5292 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
5293 ; AVX2-FP-NEXT: vmovaps 384(%rdi), %ymm1
5294 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5295 ; AVX2-FP-NEXT: vmovaps 432(%rdi), %xmm5
5296 ; AVX2-FP-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5297 ; AVX2-FP-NEXT: vmovaps 416(%rdi), %xmm4
5298 ; AVX2-FP-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5299 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
5300 ; AVX2-FP-NEXT: vpermps %ymm1, %ymm2, %ymm5
5301 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
5302 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
5303 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5304 ; AVX2-FP-NEXT: vpermps %ymm11, %ymm2, %ymm0
5305 ; AVX2-FP-NEXT: vpermps %ymm13, %ymm2, %ymm4
5306 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
5307 ; AVX2-FP-NEXT: vmovaps 640(%rdi), %ymm1
5308 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5309 ; AVX2-FP-NEXT: vmovaps 688(%rdi), %xmm5
5310 ; AVX2-FP-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5311 ; AVX2-FP-NEXT: vmovaps 672(%rdi), %xmm4
5312 ; AVX2-FP-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5313 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
5314 ; AVX2-FP-NEXT: vpermps %ymm1, %ymm2, %ymm5
5315 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
5316 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
5317 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5318 ; AVX2-FP-NEXT: vpermps %ymm8, %ymm2, %ymm0
5319 ; AVX2-FP-NEXT: vmovaps %ymm8, %ymm11
5320 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
5321 ; AVX2-FP-NEXT: vpermps %ymm8, %ymm2, %ymm4
5322 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
5323 ; AVX2-FP-NEXT: vmovaps 896(%rdi), %ymm1
5324 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5325 ; AVX2-FP-NEXT: vmovaps 944(%rdi), %xmm5
5326 ; AVX2-FP-NEXT: vmovaps %xmm5, (%rsp) # 16-byte Spill
5327 ; AVX2-FP-NEXT: vmovaps 928(%rdi), %xmm4
5328 ; AVX2-FP-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5329 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
5330 ; AVX2-FP-NEXT: vpermps %ymm1, %ymm2, %ymm5
5331 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
5332 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
5333 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5334 ; AVX2-FP-NEXT: vpermps %ymm7, %ymm2, %ymm0
5335 ; AVX2-FP-NEXT: vpermps %ymm3, %ymm2, %ymm4
5336 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
5337 ; AVX2-FP-NEXT: vmovaps (%rdi), %ymm1
5338 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5339 ; AVX2-FP-NEXT: vmovaps 32(%rdi), %xmm3
5340 ; AVX2-FP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5341 ; AVX2-FP-NEXT: vmovaps 48(%rdi), %xmm4
5342 ; AVX2-FP-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5343 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm4 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
5344 ; AVX2-FP-NEXT: vpermps %ymm1, %ymm2, %ymm5
5345 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
5346 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
5347 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5348 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
5349 ; AVX2-FP-NEXT: vpermps %ymm6, %ymm2, %ymm0
5350 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
5351 ; AVX2-FP-NEXT: vpermps %ymm7, %ymm2, %ymm4
5352 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
5353 ; AVX2-FP-NEXT: vmovaps 256(%rdi), %ymm1
5354 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5355 ; AVX2-FP-NEXT: vmovaps 304(%rdi), %xmm3
5356 ; AVX2-FP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5357 ; AVX2-FP-NEXT: vmovaps 288(%rdi), %xmm4
5358 ; AVX2-FP-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5359 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
5360 ; AVX2-FP-NEXT: vpermps %ymm1, %ymm2, %ymm5
5361 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
5362 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
5363 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5364 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
5365 ; AVX2-FP-NEXT: vpermps %ymm13, %ymm2, %ymm0
5366 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
5367 ; AVX2-FP-NEXT: vpermps %ymm12, %ymm2, %ymm4
5368 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
5369 ; AVX2-FP-NEXT: vmovaps 512(%rdi), %ymm1
5370 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5371 ; AVX2-FP-NEXT: vmovaps 560(%rdi), %xmm3
5372 ; AVX2-FP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5373 ; AVX2-FP-NEXT: vmovaps 544(%rdi), %xmm4
5374 ; AVX2-FP-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5375 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
5376 ; AVX2-FP-NEXT: vpermps %ymm1, %ymm2, %ymm5
5377 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
5378 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
5379 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5380 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5381 ; AVX2-FP-NEXT: vpermps %ymm3, %ymm2, %ymm0
5382 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5383 ; AVX2-FP-NEXT: vpermps %ymm1, %ymm2, %ymm4
5384 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
5385 ; AVX2-FP-NEXT: vmovaps 768(%rdi), %ymm4
5386 ; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5387 ; AVX2-FP-NEXT: vpermps %ymm4, %ymm2, %ymm2
5388 ; AVX2-FP-NEXT: vmovaps 816(%rdi), %xmm5
5389 ; AVX2-FP-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5390 ; AVX2-FP-NEXT: vmovaps 800(%rdi), %xmm4
5391 ; AVX2-FP-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5392 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
5393 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3]
5394 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
5395 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5396 ; AVX2-FP-NEXT: vbroadcastsd {{.*#+}} ymm0 = [2,6,2,6,2,6,2,6]
5397 ; AVX2-FP-NEXT: vpermps %ymm14, %ymm0, %ymm2
5398 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
5399 ; AVX2-FP-NEXT: vpermps %ymm5, %ymm0, %ymm4
5400 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
5401 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
5402 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
5403 ; AVX2-FP-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3]
5404 ; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
5405 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm9[2,3]
5406 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
5407 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5408 ; AVX2-FP-NEXT: vpermps %ymm15, %ymm0, %ymm2
5409 ; AVX2-FP-NEXT: vpermps %ymm10, %ymm0, %ymm4
5410 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
5411 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
5412 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
5413 ; AVX2-FP-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3]
5414 ; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
5415 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm9[2,3]
5416 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
5417 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5418 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
5419 ; AVX2-FP-NEXT: vpermps %ymm15, %ymm0, %ymm2
5420 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
5421 ; AVX2-FP-NEXT: vpermps %ymm10, %ymm0, %ymm4
5422 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
5423 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
5424 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
5425 ; AVX2-FP-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3]
5426 ; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
5427 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm9[2,3]
5428 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
5429 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5430 ; AVX2-FP-NEXT: vpermps %ymm11, %ymm0, %ymm2
5431 ; AVX2-FP-NEXT: vpermps %ymm8, %ymm0, %ymm4
5432 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
5433 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
5434 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
5435 ; AVX2-FP-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3]
5436 ; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
5437 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm9[2,3]
5438 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
5439 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5440 ; AVX2-FP-NEXT: vpermps %ymm6, %ymm0, %ymm2
5441 ; AVX2-FP-NEXT: vpermps %ymm7, %ymm0, %ymm4
5442 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
5443 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
5444 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
5445 ; AVX2-FP-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3]
5446 ; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
5447 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm9[2,3]
5448 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
5449 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5450 ; AVX2-FP-NEXT: vpermps %ymm13, %ymm0, %ymm2
5451 ; AVX2-FP-NEXT: vpermps %ymm12, %ymm0, %ymm4
5452 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
5453 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
5454 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
5455 ; AVX2-FP-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3]
5456 ; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
5457 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm9[2,3]
5458 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
5459 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5460 ; AVX2-FP-NEXT: vpermps %ymm3, %ymm0, %ymm2
5461 ; AVX2-FP-NEXT: vpermps %ymm1, %ymm0, %ymm4
5462 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
5463 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5464 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload
5465 ; AVX2-FP-NEXT: # xmm4 = xmm1[2],mem[2],xmm1[3],mem[3]
5466 ; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
5467 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm9[2,3]
5468 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm2[4,5,6,7]
5469 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5470 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5471 ; AVX2-FP-NEXT: vpermps %ymm1, %ymm0, %ymm2
5472 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5473 ; AVX2-FP-NEXT: vpermps %ymm3, %ymm0, %ymm4
5474 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
5475 ; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
5476 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
5477 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
5478 ; AVX2-FP-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3]
5479 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3]
5480 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm2[4,5,6,7]
5481 ; AVX2-FP-NEXT: vbroadcastsd {{.*#+}} ymm0 = [3,7,3,7,3,7,3,7]
5482 ; AVX2-FP-NEXT: vpermps %ymm14, %ymm0, %ymm2
5483 ; AVX2-FP-NEXT: vpermps %ymm5, %ymm0, %ymm4
5484 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
5485 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
5486 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
5487 ; AVX2-FP-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3]
5488 ; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
5489 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm4 = xmm14[0,1],xmm4[2,3]
5490 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
5491 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5492 ; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
5493 ; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
5494 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
5495 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
5496 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
5497 ; AVX2-FP-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3]
5498 ; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
5499 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm4 = xmm14[0,1],xmm4[2,3]
5500 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm2[4,5,6,7]
5501 ; AVX2-FP-NEXT: vpermps %ymm15, %ymm0, %ymm2
5502 ; AVX2-FP-NEXT: vpermps %ymm10, %ymm0, %ymm14
5503 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0,1,2,3,4,5],ymm2[6,7]
5504 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
5505 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm14 # 16-byte Folded Reload
5506 ; AVX2-FP-NEXT: # xmm14 = xmm5[2],mem[2],xmm5[3],mem[3]
5507 ; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
5508 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3]
5509 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm2[4,5,6,7]
5510 ; AVX2-FP-NEXT: vpermps %ymm11, %ymm0, %ymm11
5511 ; AVX2-FP-NEXT: vpermps %ymm8, %ymm0, %ymm14
5512 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm14[0,1,2,3,4,5],ymm11[6,7]
5513 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
5514 ; AVX2-FP-NEXT: vunpckhps (%rsp), %xmm5, %xmm14 # 16-byte Folded Reload
5515 ; AVX2-FP-NEXT: # xmm14 = xmm5[2],mem[2],xmm5[3],mem[3]
5516 ; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
5517 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3]
5518 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm14[0,1,2,3],ymm11[4,5,6,7]
5519 ; AVX2-FP-NEXT: vpermps %ymm1, %ymm0, %ymm1
5520 ; AVX2-FP-NEXT: vpermps %ymm3, %ymm0, %ymm3
5521 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
5522 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
5523 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
5524 ; AVX2-FP-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3]
5525 ; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
5526 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm3 = xmm14[0,1],xmm3[2,3]
5527 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
5528 ; AVX2-FP-NEXT: vpermps %ymm6, %ymm0, %ymm3
5529 ; AVX2-FP-NEXT: vpermps %ymm7, %ymm0, %ymm14
5530 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm14[0,1,2,3,4,5],ymm3[6,7]
5531 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
5532 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm14 # 16-byte Folded Reload
5533 ; AVX2-FP-NEXT: # xmm14 = xmm5[2],mem[2],xmm5[3],mem[3]
5534 ; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
5535 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3]
5536 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm14[0,1,2,3],ymm3[4,5,6,7]
5537 ; AVX2-FP-NEXT: vpermps %ymm13, %ymm0, %ymm10
5538 ; AVX2-FP-NEXT: vpermps %ymm12, %ymm0, %ymm14
5539 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm14[0,1,2,3,4,5],ymm10[6,7]
5540 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
5541 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm13 # 16-byte Folded Reload
5542 ; AVX2-FP-NEXT: # xmm13 = xmm5[2],mem[2],xmm5[3],mem[3]
5543 ; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
5544 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3]
5545 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0,1,2,3],ymm10[4,5,6,7]
5546 ; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload
5547 ; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
5548 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7]
5549 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
5550 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
5551 ; AVX2-FP-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3]
5552 ; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
5553 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3]
5554 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7]
5555 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
5556 ; AVX2-FP-NEXT: vmovaps %ymm5, 192(%rsi)
5557 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
5558 ; AVX2-FP-NEXT: vmovaps %ymm5, 128(%rsi)
5559 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
5560 ; AVX2-FP-NEXT: vmovaps %ymm5, 64(%rsi)
5561 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
5562 ; AVX2-FP-NEXT: vmovaps %ymm5, (%rsi)
5563 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
5564 ; AVX2-FP-NEXT: vmovaps %ymm5, 224(%rsi)
5565 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
5566 ; AVX2-FP-NEXT: vmovaps %ymm5, 160(%rsi)
5567 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
5568 ; AVX2-FP-NEXT: vmovaps %ymm5, 96(%rsi)
5569 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
5570 ; AVX2-FP-NEXT: vmovaps %ymm5, 32(%rsi)
5571 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
5572 ; AVX2-FP-NEXT: vmovaps %ymm5, 192(%rdx)
5573 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
5574 ; AVX2-FP-NEXT: vmovaps %ymm5, 128(%rdx)
5575 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
5576 ; AVX2-FP-NEXT: vmovaps %ymm5, 64(%rdx)
5577 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
5578 ; AVX2-FP-NEXT: vmovaps %ymm5, (%rdx)
5579 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
5580 ; AVX2-FP-NEXT: vmovaps %ymm5, 224(%rdx)
5581 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
5582 ; AVX2-FP-NEXT: vmovaps %ymm5, 160(%rdx)
5583 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
5584 ; AVX2-FP-NEXT: vmovaps %ymm5, 96(%rdx)
5585 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
5586 ; AVX2-FP-NEXT: vmovaps %ymm5, 32(%rdx)
5587 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
5588 ; AVX2-FP-NEXT: vmovaps %ymm5, 192(%rcx)
5589 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
5590 ; AVX2-FP-NEXT: vmovaps %ymm5, 128(%rcx)
5591 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
5592 ; AVX2-FP-NEXT: vmovaps %ymm5, 64(%rcx)
5593 ; AVX2-FP-NEXT: vmovaps %ymm9, (%rcx)
5594 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
5595 ; AVX2-FP-NEXT: vmovaps %ymm5, 224(%rcx)
5596 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
5597 ; AVX2-FP-NEXT: vmovaps %ymm5, 160(%rcx)
5598 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
5599 ; AVX2-FP-NEXT: vmovaps %ymm5, 96(%rcx)
5600 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
5601 ; AVX2-FP-NEXT: vmovaps %ymm5, 32(%rcx)
5602 ; AVX2-FP-NEXT: vmovaps %ymm0, 192(%r8)
5603 ; AVX2-FP-NEXT: vmovaps %ymm10, 128(%r8)
5604 ; AVX2-FP-NEXT: vmovaps %ymm3, 64(%r8)
5605 ; AVX2-FP-NEXT: vmovaps %ymm1, (%r8)
5606 ; AVX2-FP-NEXT: vmovaps %ymm11, 224(%r8)
5607 ; AVX2-FP-NEXT: vmovaps %ymm2, 160(%r8)
5608 ; AVX2-FP-NEXT: vmovaps %ymm4, 96(%r8)
5609 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5610 ; AVX2-FP-NEXT: vmovaps %ymm0, 32(%r8)
5611 ; AVX2-FP-NEXT: addq $1944, %rsp # imm = 0x798
5612 ; AVX2-FP-NEXT: vzeroupper
5613 ; AVX2-FP-NEXT: retq
5614 ;
5615 ; AVX2-FCP-LABEL: load_i32_stride4_vf64:
5616 ; AVX2-FCP: # %bb.0:
5617 ; AVX2-FCP-NEXT: subq $1944, %rsp # imm = 0x798
5618 ; AVX2-FCP-NEXT: vmovaps 704(%rdi), %ymm13
5619 ; AVX2-FCP-NEXT: vmovaps 416(%rdi), %ymm8
5620 ; AVX2-FCP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5621 ; AVX2-FCP-NEXT: vmovaps 448(%rdi), %ymm4
5622 ; AVX2-FCP-NEXT: vmovaps 480(%rdi), %ymm15
5623 ; AVX2-FCP-NEXT: vmovaps 160(%rdi), %ymm9
5624 ; AVX2-FCP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5625 ; AVX2-FCP-NEXT: vmovaps 192(%rdi), %ymm10
5626 ; AVX2-FCP-NEXT: vmovaps 224(%rdi), %ymm14
5627 ; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm2 = [0,4,0,4,0,4,0,4]
5628 ; AVX2-FCP-NEXT: vpermps %ymm14, %ymm2, %ymm0
5629 ; AVX2-FCP-NEXT: vpermps %ymm10, %ymm2, %ymm1
5630 ; AVX2-FCP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5631 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
5632 ; AVX2-FCP-NEXT: vmovaps 144(%rdi), %xmm3
5633 ; AVX2-FCP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5634 ; AVX2-FCP-NEXT: vmovaps 128(%rdi), %xmm1
5635 ; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5636 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
5637 ; AVX2-FCP-NEXT: vpermps %ymm9, %ymm2, %ymm3
5638 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
5639 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5640 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5641 ; AVX2-FCP-NEXT: vpermps %ymm15, %ymm2, %ymm0
5642 ; AVX2-FCP-NEXT: vpermps %ymm4, %ymm2, %ymm1
5643 ; AVX2-FCP-NEXT: vmovaps %ymm4, %ymm9
5644 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
5645 ; AVX2-FCP-NEXT: vmovaps 400(%rdi), %xmm3
5646 ; AVX2-FCP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5647 ; AVX2-FCP-NEXT: vmovaps 384(%rdi), %xmm1
5648 ; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5649 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
5650 ; AVX2-FCP-NEXT: vpermps %ymm8, %ymm2, %ymm3
5651 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
5652 ; AVX2-FCP-NEXT: vmovaps 736(%rdi), %ymm11
5653 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5654 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5655 ; AVX2-FCP-NEXT: vpermps %ymm11, %ymm2, %ymm0
5656 ; AVX2-FCP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5657 ; AVX2-FCP-NEXT: vpermps %ymm13, %ymm2, %ymm1
5658 ; AVX2-FCP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5659 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
5660 ; AVX2-FCP-NEXT: vmovaps 672(%rdi), %ymm4
5661 ; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5662 ; AVX2-FCP-NEXT: vmovaps 656(%rdi), %xmm3
5663 ; AVX2-FCP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5664 ; AVX2-FCP-NEXT: vmovaps 640(%rdi), %xmm1
5665 ; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5666 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
5667 ; AVX2-FCP-NEXT: vpermps %ymm4, %ymm2, %ymm3
5668 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
5669 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5670 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5671 ; AVX2-FCP-NEXT: vmovaps 960(%rdi), %ymm3
5672 ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5673 ; AVX2-FCP-NEXT: vmovaps 992(%rdi), %ymm8
5674 ; AVX2-FCP-NEXT: vpermps %ymm8, %ymm2, %ymm0
5675 ; AVX2-FCP-NEXT: vpermps %ymm3, %ymm2, %ymm1
5676 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
5677 ; AVX2-FCP-NEXT: vmovaps 928(%rdi), %ymm4
5678 ; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5679 ; AVX2-FCP-NEXT: vmovaps 912(%rdi), %xmm3
5680 ; AVX2-FCP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5681 ; AVX2-FCP-NEXT: vmovaps 896(%rdi), %xmm1
5682 ; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5683 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
5684 ; AVX2-FCP-NEXT: vpermps %ymm4, %ymm2, %ymm3
5685 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
5686 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5687 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5688 ; AVX2-FCP-NEXT: vmovaps 320(%rdi), %ymm3
5689 ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5690 ; AVX2-FCP-NEXT: vmovaps 352(%rdi), %ymm0
5691 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5692 ; AVX2-FCP-NEXT: vpermps %ymm0, %ymm2, %ymm0
5693 ; AVX2-FCP-NEXT: vpermps %ymm3, %ymm2, %ymm1
5694 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
5695 ; AVX2-FCP-NEXT: vmovaps 288(%rdi), %ymm4
5696 ; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5697 ; AVX2-FCP-NEXT: vmovaps 272(%rdi), %xmm3
5698 ; AVX2-FCP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5699 ; AVX2-FCP-NEXT: vmovaps 256(%rdi), %xmm1
5700 ; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5701 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
5702 ; AVX2-FCP-NEXT: vpermps %ymm4, %ymm2, %ymm3
5703 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
5704 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5705 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5706 ; AVX2-FCP-NEXT: vmovaps 576(%rdi), %ymm3
5707 ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5708 ; AVX2-FCP-NEXT: vmovaps 608(%rdi), %ymm0
5709 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5710 ; AVX2-FCP-NEXT: vpermps %ymm0, %ymm2, %ymm0
5711 ; AVX2-FCP-NEXT: vpermps %ymm3, %ymm2, %ymm1
5712 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
5713 ; AVX2-FCP-NEXT: vmovaps 544(%rdi), %ymm4
5714 ; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5715 ; AVX2-FCP-NEXT: vmovaps 528(%rdi), %xmm3
5716 ; AVX2-FCP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5717 ; AVX2-FCP-NEXT: vmovaps 512(%rdi), %xmm1
5718 ; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5719 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
5720 ; AVX2-FCP-NEXT: vpermps %ymm4, %ymm2, %ymm3
5721 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
5722 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5723 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5724 ; AVX2-FCP-NEXT: vmovaps 832(%rdi), %ymm3
5725 ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5726 ; AVX2-FCP-NEXT: vmovaps 864(%rdi), %ymm0
5727 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5728 ; AVX2-FCP-NEXT: vpermps %ymm0, %ymm2, %ymm0
5729 ; AVX2-FCP-NEXT: vpermps %ymm3, %ymm2, %ymm1
5730 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
5731 ; AVX2-FCP-NEXT: vmovaps 800(%rdi), %ymm4
5732 ; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5733 ; AVX2-FCP-NEXT: vmovaps 784(%rdi), %xmm3
5734 ; AVX2-FCP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5735 ; AVX2-FCP-NEXT: vmovaps 768(%rdi), %xmm1
5736 ; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5737 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
5738 ; AVX2-FCP-NEXT: vpermps %ymm4, %ymm2, %ymm3
5739 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
5740 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5741 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5742 ; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm1
5743 ; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm7
5744 ; AVX2-FCP-NEXT: vpermps %ymm7, %ymm2, %ymm0
5745 ; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5746 ; AVX2-FCP-NEXT: vpermps %ymm1, %ymm2, %ymm4
5747 ; AVX2-FCP-NEXT: vmovaps %ymm1, %ymm3
5748 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5749 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
5750 ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm1
5751 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5752 ; AVX2-FCP-NEXT: vpermps %ymm1, %ymm2, %ymm2
5753 ; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm1
5754 ; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5755 ; AVX2-FCP-NEXT: vmovaps 16(%rdi), %xmm4
5756 ; AVX2-FCP-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5757 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm4 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
5758 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3]
5759 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
5760 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5761 ; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm2 = [1,5,1,5,1,5,1,5]
5762 ; AVX2-FCP-NEXT: vpermps %ymm14, %ymm2, %ymm0
5763 ; AVX2-FCP-NEXT: vpermps %ymm10, %ymm2, %ymm4
5764 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
5765 ; AVX2-FCP-NEXT: vmovaps 128(%rdi), %ymm1
5766 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5767 ; AVX2-FCP-NEXT: vmovaps 176(%rdi), %xmm5
5768 ; AVX2-FCP-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5769 ; AVX2-FCP-NEXT: vmovaps 160(%rdi), %xmm4
5770 ; AVX2-FCP-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5771 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
5772 ; AVX2-FCP-NEXT: vpermps %ymm1, %ymm2, %ymm5
5773 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
5774 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
5775 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5776 ; AVX2-FCP-NEXT: vpermps %ymm15, %ymm2, %ymm0
5777 ; AVX2-FCP-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5778 ; AVX2-FCP-NEXT: vpermps %ymm9, %ymm2, %ymm4
5779 ; AVX2-FCP-NEXT: vmovaps %ymm9, %ymm10
5780 ; AVX2-FCP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5781 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
5782 ; AVX2-FCP-NEXT: vmovaps 384(%rdi), %ymm1
5783 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5784 ; AVX2-FCP-NEXT: vmovaps 432(%rdi), %xmm5
5785 ; AVX2-FCP-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5786 ; AVX2-FCP-NEXT: vmovaps 416(%rdi), %xmm4
5787 ; AVX2-FCP-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5788 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
5789 ; AVX2-FCP-NEXT: vpermps %ymm1, %ymm2, %ymm5
5790 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
5791 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
5792 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5793 ; AVX2-FCP-NEXT: vpermps %ymm11, %ymm2, %ymm0
5794 ; AVX2-FCP-NEXT: vpermps %ymm13, %ymm2, %ymm4
5795 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
5796 ; AVX2-FCP-NEXT: vmovaps 640(%rdi), %ymm1
5797 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5798 ; AVX2-FCP-NEXT: vmovaps 688(%rdi), %xmm5
5799 ; AVX2-FCP-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5800 ; AVX2-FCP-NEXT: vmovaps 672(%rdi), %xmm4
5801 ; AVX2-FCP-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5802 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
5803 ; AVX2-FCP-NEXT: vpermps %ymm1, %ymm2, %ymm5
5804 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
5805 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
5806 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5807 ; AVX2-FCP-NEXT: vpermps %ymm8, %ymm2, %ymm0
5808 ; AVX2-FCP-NEXT: vmovaps %ymm8, %ymm11
5809 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
5810 ; AVX2-FCP-NEXT: vpermps %ymm8, %ymm2, %ymm4
5811 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
5812 ; AVX2-FCP-NEXT: vmovaps 896(%rdi), %ymm1
5813 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5814 ; AVX2-FCP-NEXT: vmovaps 944(%rdi), %xmm5
5815 ; AVX2-FCP-NEXT: vmovaps %xmm5, (%rsp) # 16-byte Spill
5816 ; AVX2-FCP-NEXT: vmovaps 928(%rdi), %xmm4
5817 ; AVX2-FCP-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5818 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
5819 ; AVX2-FCP-NEXT: vpermps %ymm1, %ymm2, %ymm5
5820 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
5821 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
5822 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5823 ; AVX2-FCP-NEXT: vpermps %ymm7, %ymm2, %ymm0
5824 ; AVX2-FCP-NEXT: vpermps %ymm3, %ymm2, %ymm4
5825 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
5826 ; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm1
5827 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5828 ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %xmm3
5829 ; AVX2-FCP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5830 ; AVX2-FCP-NEXT: vmovaps 48(%rdi), %xmm4
5831 ; AVX2-FCP-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5832 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm4 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
5833 ; AVX2-FCP-NEXT: vpermps %ymm1, %ymm2, %ymm5
5834 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
5835 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
5836 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5837 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
5838 ; AVX2-FCP-NEXT: vpermps %ymm6, %ymm2, %ymm0
5839 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
5840 ; AVX2-FCP-NEXT: vpermps %ymm7, %ymm2, %ymm4
5841 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
5842 ; AVX2-FCP-NEXT: vmovaps 256(%rdi), %ymm1
5843 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5844 ; AVX2-FCP-NEXT: vmovaps 304(%rdi), %xmm3
5845 ; AVX2-FCP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5846 ; AVX2-FCP-NEXT: vmovaps 288(%rdi), %xmm4
5847 ; AVX2-FCP-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5848 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
5849 ; AVX2-FCP-NEXT: vpermps %ymm1, %ymm2, %ymm5
5850 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
5851 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
5852 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5853 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
5854 ; AVX2-FCP-NEXT: vpermps %ymm13, %ymm2, %ymm0
5855 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
5856 ; AVX2-FCP-NEXT: vpermps %ymm12, %ymm2, %ymm4
5857 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
5858 ; AVX2-FCP-NEXT: vmovaps 512(%rdi), %ymm1
5859 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5860 ; AVX2-FCP-NEXT: vmovaps 560(%rdi), %xmm3
5861 ; AVX2-FCP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5862 ; AVX2-FCP-NEXT: vmovaps 544(%rdi), %xmm4
5863 ; AVX2-FCP-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5864 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
5865 ; AVX2-FCP-NEXT: vpermps %ymm1, %ymm2, %ymm5
5866 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
5867 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
5868 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5869 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5870 ; AVX2-FCP-NEXT: vpermps %ymm3, %ymm2, %ymm0
5871 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5872 ; AVX2-FCP-NEXT: vpermps %ymm1, %ymm2, %ymm4
5873 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
5874 ; AVX2-FCP-NEXT: vmovaps 768(%rdi), %ymm4
5875 ; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5876 ; AVX2-FCP-NEXT: vpermps %ymm4, %ymm2, %ymm2
5877 ; AVX2-FCP-NEXT: vmovaps 816(%rdi), %xmm5
5878 ; AVX2-FCP-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5879 ; AVX2-FCP-NEXT: vmovaps 800(%rdi), %xmm4
5880 ; AVX2-FCP-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5881 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
5882 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3]
5883 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
5884 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5885 ; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm0 = [2,6,2,6,2,6,2,6]
5886 ; AVX2-FCP-NEXT: vpermps %ymm14, %ymm0, %ymm2
5887 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
5888 ; AVX2-FCP-NEXT: vpermps %ymm5, %ymm0, %ymm4
5889 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
5890 ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
5891 ; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
5892 ; AVX2-FCP-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3]
5893 ; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
5894 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm9[2,3]
5895 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
5896 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5897 ; AVX2-FCP-NEXT: vpermps %ymm15, %ymm0, %ymm2
5898 ; AVX2-FCP-NEXT: vpermps %ymm10, %ymm0, %ymm4
5899 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
5900 ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
5901 ; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
5902 ; AVX2-FCP-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3]
5903 ; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
5904 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm9[2,3]
5905 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
5906 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5907 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
5908 ; AVX2-FCP-NEXT: vpermps %ymm15, %ymm0, %ymm2
5909 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
5910 ; AVX2-FCP-NEXT: vpermps %ymm10, %ymm0, %ymm4
5911 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
5912 ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
5913 ; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
5914 ; AVX2-FCP-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3]
5915 ; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
5916 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm9[2,3]
5917 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
5918 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5919 ; AVX2-FCP-NEXT: vpermps %ymm11, %ymm0, %ymm2
5920 ; AVX2-FCP-NEXT: vpermps %ymm8, %ymm0, %ymm4
5921 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
5922 ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
5923 ; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
5924 ; AVX2-FCP-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3]
5925 ; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
5926 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm9[2,3]
5927 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
5928 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5929 ; AVX2-FCP-NEXT: vpermps %ymm6, %ymm0, %ymm2
5930 ; AVX2-FCP-NEXT: vpermps %ymm7, %ymm0, %ymm4
5931 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
5932 ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
5933 ; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
5934 ; AVX2-FCP-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3]
5935 ; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
5936 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm9[2,3]
5937 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
5938 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5939 ; AVX2-FCP-NEXT: vpermps %ymm13, %ymm0, %ymm2
5940 ; AVX2-FCP-NEXT: vpermps %ymm12, %ymm0, %ymm4
5941 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
5942 ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
5943 ; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
5944 ; AVX2-FCP-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3]
5945 ; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
5946 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm9[2,3]
5947 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
5948 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5949 ; AVX2-FCP-NEXT: vpermps %ymm3, %ymm0, %ymm2
5950 ; AVX2-FCP-NEXT: vpermps %ymm1, %ymm0, %ymm4
5951 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
5952 ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5953 ; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload
5954 ; AVX2-FCP-NEXT: # xmm4 = xmm1[2],mem[2],xmm1[3],mem[3]
5955 ; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
5956 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm9[2,3]
5957 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm2[4,5,6,7]
5958 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5959 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5960 ; AVX2-FCP-NEXT: vpermps %ymm1, %ymm0, %ymm2
5961 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5962 ; AVX2-FCP-NEXT: vpermps %ymm3, %ymm0, %ymm4
5963 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
5964 ; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
5965 ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
5966 ; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
5967 ; AVX2-FCP-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3]
5968 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3]
5969 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm2[4,5,6,7]
5970 ; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm0 = [3,7,3,7,3,7,3,7]
5971 ; AVX2-FCP-NEXT: vpermps %ymm14, %ymm0, %ymm2
5972 ; AVX2-FCP-NEXT: vpermps %ymm5, %ymm0, %ymm4
5973 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
5974 ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
5975 ; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
5976 ; AVX2-FCP-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3]
5977 ; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
5978 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm4 = xmm14[0,1],xmm4[2,3]
5979 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
5980 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5981 ; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
5982 ; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
5983 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
5984 ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
5985 ; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
5986 ; AVX2-FCP-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3]
5987 ; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
5988 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm4 = xmm14[0,1],xmm4[2,3]
5989 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm2[4,5,6,7]
5990 ; AVX2-FCP-NEXT: vpermps %ymm15, %ymm0, %ymm2
5991 ; AVX2-FCP-NEXT: vpermps %ymm10, %ymm0, %ymm14
5992 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0,1,2,3,4,5],ymm2[6,7]
5993 ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
5994 ; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm14 # 16-byte Folded Reload
5995 ; AVX2-FCP-NEXT: # xmm14 = xmm5[2],mem[2],xmm5[3],mem[3]
5996 ; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
5997 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3]
5998 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm2[4,5,6,7]
5999 ; AVX2-FCP-NEXT: vpermps %ymm11, %ymm0, %ymm11
6000 ; AVX2-FCP-NEXT: vpermps %ymm8, %ymm0, %ymm14
6001 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm14[0,1,2,3,4,5],ymm11[6,7]
6002 ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
6003 ; AVX2-FCP-NEXT: vunpckhps (%rsp), %xmm5, %xmm14 # 16-byte Folded Reload
6004 ; AVX2-FCP-NEXT: # xmm14 = xmm5[2],mem[2],xmm5[3],mem[3]
6005 ; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
6006 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3]
6007 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm14[0,1,2,3],ymm11[4,5,6,7]
6008 ; AVX2-FCP-NEXT: vpermps %ymm1, %ymm0, %ymm1
6009 ; AVX2-FCP-NEXT: vpermps %ymm3, %ymm0, %ymm3
6010 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
6011 ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
6012 ; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
6013 ; AVX2-FCP-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3]
6014 ; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
6015 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm3 = xmm14[0,1],xmm3[2,3]
6016 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
6017 ; AVX2-FCP-NEXT: vpermps %ymm6, %ymm0, %ymm3
6018 ; AVX2-FCP-NEXT: vpermps %ymm7, %ymm0, %ymm14
6019 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm14[0,1,2,3,4,5],ymm3[6,7]
6020 ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
6021 ; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm14 # 16-byte Folded Reload
6022 ; AVX2-FCP-NEXT: # xmm14 = xmm5[2],mem[2],xmm5[3],mem[3]
6023 ; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
6024 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3]
6025 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm14[0,1,2,3],ymm3[4,5,6,7]
6026 ; AVX2-FCP-NEXT: vpermps %ymm13, %ymm0, %ymm10
6027 ; AVX2-FCP-NEXT: vpermps %ymm12, %ymm0, %ymm14
6028 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm14[0,1,2,3,4,5],ymm10[6,7]
6029 ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
6030 ; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm13 # 16-byte Folded Reload
6031 ; AVX2-FCP-NEXT: # xmm13 = xmm5[2],mem[2],xmm5[3],mem[3]
6032 ; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
6033 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3]
6034 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0,1,2,3],ymm10[4,5,6,7]
6035 ; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload
6036 ; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
6037 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7]
6038 ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
6039 ; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
6040 ; AVX2-FCP-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3]
6041 ; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
6042 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3]
6043 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7]
6044 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
6045 ; AVX2-FCP-NEXT: vmovaps %ymm5, 192(%rsi)
6046 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
6047 ; AVX2-FCP-NEXT: vmovaps %ymm5, 128(%rsi)
6048 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
6049 ; AVX2-FCP-NEXT: vmovaps %ymm5, 64(%rsi)
6050 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
6051 ; AVX2-FCP-NEXT: vmovaps %ymm5, (%rsi)
6052 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
6053 ; AVX2-FCP-NEXT: vmovaps %ymm5, 224(%rsi)
6054 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
6055 ; AVX2-FCP-NEXT: vmovaps %ymm5, 160(%rsi)
6056 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
6057 ; AVX2-FCP-NEXT: vmovaps %ymm5, 96(%rsi)
6058 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
6059 ; AVX2-FCP-NEXT: vmovaps %ymm5, 32(%rsi)
6060 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
6061 ; AVX2-FCP-NEXT: vmovaps %ymm5, 192(%rdx)
6062 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
6063 ; AVX2-FCP-NEXT: vmovaps %ymm5, 128(%rdx)
6064 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
6065 ; AVX2-FCP-NEXT: vmovaps %ymm5, 64(%rdx)
6066 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
6067 ; AVX2-FCP-NEXT: vmovaps %ymm5, (%rdx)
6068 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
6069 ; AVX2-FCP-NEXT: vmovaps %ymm5, 224(%rdx)
6070 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
6071 ; AVX2-FCP-NEXT: vmovaps %ymm5, 160(%rdx)
6072 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
6073 ; AVX2-FCP-NEXT: vmovaps %ymm5, 96(%rdx)
6074 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
6075 ; AVX2-FCP-NEXT: vmovaps %ymm5, 32(%rdx)
6076 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
6077 ; AVX2-FCP-NEXT: vmovaps %ymm5, 192(%rcx)
6078 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
6079 ; AVX2-FCP-NEXT: vmovaps %ymm5, 128(%rcx)
6080 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
6081 ; AVX2-FCP-NEXT: vmovaps %ymm5, 64(%rcx)
6082 ; AVX2-FCP-NEXT: vmovaps %ymm9, (%rcx)
6083 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
6084 ; AVX2-FCP-NEXT: vmovaps %ymm5, 224(%rcx)
6085 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
6086 ; AVX2-FCP-NEXT: vmovaps %ymm5, 160(%rcx)
6087 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
6088 ; AVX2-FCP-NEXT: vmovaps %ymm5, 96(%rcx)
6089 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
6090 ; AVX2-FCP-NEXT: vmovaps %ymm5, 32(%rcx)
6091 ; AVX2-FCP-NEXT: vmovaps %ymm0, 192(%r8)
6092 ; AVX2-FCP-NEXT: vmovaps %ymm10, 128(%r8)
6093 ; AVX2-FCP-NEXT: vmovaps %ymm3, 64(%r8)
6094 ; AVX2-FCP-NEXT: vmovaps %ymm1, (%r8)
6095 ; AVX2-FCP-NEXT: vmovaps %ymm11, 224(%r8)
6096 ; AVX2-FCP-NEXT: vmovaps %ymm2, 160(%r8)
6097 ; AVX2-FCP-NEXT: vmovaps %ymm4, 96(%r8)
6098 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6099 ; AVX2-FCP-NEXT: vmovaps %ymm0, 32(%r8)
6100 ; AVX2-FCP-NEXT: addq $1944, %rsp # imm = 0x798
6101 ; AVX2-FCP-NEXT: vzeroupper
6102 ; AVX2-FCP-NEXT: retq
6104 ; AVX512-LABEL: load_i32_stride4_vf64:
6105 ; AVX512: # %bb.0:
6106 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
6107 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1
6108 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm2
6109 ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm3
6110 ; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm4
6111 ; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm5
6112 ; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm10
6113 ; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm7
6114 ; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm14
6115 ; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm15
6116 ; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm17
6117 ; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm16
6118 ; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm8
6119 ; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm9
6120 ; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm12
6121 ; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm11
6122 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28]
6123 ; AVX512-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3]
6124 ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm6
6125 ; AVX512-NEXT: vpermt2d %zmm12, %zmm19, %zmm6
6126 ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm13
6127 ; AVX512-NEXT: vpermt2d %zmm8, %zmm19, %zmm13
6128 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm13[0,1,2,3],zmm6[4,5,6,7]
6129 ; AVX512-NEXT: vmovdqa64 %zmm16, %zmm13
6130 ; AVX512-NEXT: vpermt2d %zmm17, %zmm19, %zmm13
6131 ; AVX512-NEXT: vmovdqa64 %zmm15, %zmm18
6132 ; AVX512-NEXT: vpermt2d %zmm14, %zmm19, %zmm18
6133 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm18[0,1,2,3],zmm13[4,5,6,7]
6134 ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm18
6135 ; AVX512-NEXT: vpermt2d %zmm10, %zmm19, %zmm18
6136 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm20
6137 ; AVX512-NEXT: vpermt2d %zmm4, %zmm19, %zmm20
6138 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm20[0,1,2,3],zmm18[4,5,6,7]
6139 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm20
6140 ; AVX512-NEXT: vpermt2d %zmm3, %zmm19, %zmm20
6141 ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm19
6142 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],zmm20[4,5,6,7]
6143 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [1,5,9,13,17,21,25,29,1,5,9,13,17,21,25,29]
6144 ; AVX512-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3]
6145 ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm20
6146 ; AVX512-NEXT: vpermt2d %zmm12, %zmm21, %zmm20
6147 ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm22
6148 ; AVX512-NEXT: vpermt2d %zmm8, %zmm21, %zmm22
6149 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm22[0,1,2,3],zmm20[4,5,6,7]
6150 ; AVX512-NEXT: vmovdqa64 %zmm16, %zmm22
6151 ; AVX512-NEXT: vpermt2d %zmm17, %zmm21, %zmm22
6152 ; AVX512-NEXT: vmovdqa64 %zmm15, %zmm23
6153 ; AVX512-NEXT: vpermt2d %zmm14, %zmm21, %zmm23
6154 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm23[0,1,2,3],zmm22[4,5,6,7]
6155 ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm23
6156 ; AVX512-NEXT: vpermt2d %zmm10, %zmm21, %zmm23
6157 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm24
6158 ; AVX512-NEXT: vpermt2d %zmm4, %zmm21, %zmm24
6159 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm24[0,1,2,3],zmm23[4,5,6,7]
6160 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm24
6161 ; AVX512-NEXT: vpermt2d %zmm3, %zmm21, %zmm24
6162 ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm21
6163 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm21[0,1,2,3],zmm24[4,5,6,7]
6164 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [2,6,10,14,18,22,26,30,2,6,10,14,18,22,26,30]
6165 ; AVX512-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3]
6166 ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm25
6167 ; AVX512-NEXT: vpermt2d %zmm12, %zmm24, %zmm25
6168 ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm26
6169 ; AVX512-NEXT: vpermt2d %zmm8, %zmm24, %zmm26
6170 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm26[0,1,2,3],zmm25[4,5,6,7]
6171 ; AVX512-NEXT: vmovdqa64 %zmm16, %zmm26
6172 ; AVX512-NEXT: vpermt2d %zmm17, %zmm24, %zmm26
6173 ; AVX512-NEXT: vmovdqa64 %zmm15, %zmm27
6174 ; AVX512-NEXT: vpermt2d %zmm14, %zmm24, %zmm27
6175 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm26 = zmm27[0,1,2,3],zmm26[4,5,6,7]
6176 ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm27
6177 ; AVX512-NEXT: vpermt2d %zmm10, %zmm24, %zmm27
6178 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm28
6179 ; AVX512-NEXT: vpermt2d %zmm4, %zmm24, %zmm28
6180 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm27 = zmm28[0,1,2,3],zmm27[4,5,6,7]
6181 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm28
6182 ; AVX512-NEXT: vpermt2d %zmm3, %zmm24, %zmm28
6183 ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm24
6184 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm24[0,1,2,3],zmm28[4,5,6,7]
6185 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [3,7,11,15,19,23,27,31,3,7,11,15,19,23,27,31]
6186 ; AVX512-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3]
6187 ; AVX512-NEXT: vpermt2d %zmm17, %zmm28, %zmm16
6188 ; AVX512-NEXT: vpermt2d %zmm14, %zmm28, %zmm15
6189 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm15[0,1,2,3],zmm16[4,5,6,7]
6190 ; AVX512-NEXT: vpermt2d %zmm10, %zmm28, %zmm7
6191 ; AVX512-NEXT: vpermt2d %zmm4, %zmm28, %zmm5
6192 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7]
6193 ; AVX512-NEXT: vpermt2d %zmm12, %zmm28, %zmm11
6194 ; AVX512-NEXT: vpermt2d %zmm8, %zmm28, %zmm9
6195 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,2,3],zmm11[4,5,6,7]
6196 ; AVX512-NEXT: vpermt2d %zmm3, %zmm28, %zmm2
6197 ; AVX512-NEXT: vpermt2d %zmm1, %zmm28, %zmm0
6198 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7]
6199 ; AVX512-NEXT: vmovdqa64 %zmm18, 192(%rsi)
6200 ; AVX512-NEXT: vmovdqa64 %zmm19, (%rsi)
6201 ; AVX512-NEXT: vmovdqa64 %zmm13, 64(%rsi)
6202 ; AVX512-NEXT: vmovdqa64 %zmm6, 128(%rsi)
6203 ; AVX512-NEXT: vmovdqa64 %zmm23, 192(%rdx)
6204 ; AVX512-NEXT: vmovdqa64 %zmm21, (%rdx)
6205 ; AVX512-NEXT: vmovdqa64 %zmm22, 64(%rdx)
6206 ; AVX512-NEXT: vmovdqa64 %zmm20, 128(%rdx)
6207 ; AVX512-NEXT: vmovdqa64 %zmm27, 192(%rcx)
6208 ; AVX512-NEXT: vmovdqa64 %zmm24, (%rcx)
6209 ; AVX512-NEXT: vmovdqa64 %zmm26, 64(%rcx)
6210 ; AVX512-NEXT: vmovdqa64 %zmm25, 128(%rcx)
6211 ; AVX512-NEXT: vmovdqa64 %zmm5, 128(%r8)
6212 ; AVX512-NEXT: vmovdqa64 %zmm4, 192(%r8)
6213 ; AVX512-NEXT: vmovdqa64 %zmm0, (%r8)
6214 ; AVX512-NEXT: vmovdqa64 %zmm14, 64(%r8)
6215 ; AVX512-NEXT: vzeroupper
6216 ; AVX512-NEXT: retq
6218 ; AVX512-FCP-LABEL: load_i32_stride4_vf64:
6219 ; AVX512-FCP: # %bb.0:
6220 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
6221 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
6222 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2
6223 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3
6224 ; AVX512-FCP-NEXT: vmovdqa64 832(%rdi), %zmm4
6225 ; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm5
6226 ; AVX512-FCP-NEXT: vmovdqa64 960(%rdi), %zmm10
6227 ; AVX512-FCP-NEXT: vmovdqa64 896(%rdi), %zmm7
6228 ; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm14
6229 ; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm15
6230 ; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm17
6231 ; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm16
6232 ; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm8
6233 ; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm9
6234 ; AVX512-FCP-NEXT: vmovdqa64 704(%rdi), %zmm12
6235 ; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %zmm11
6236 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28]
6237 ; AVX512-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3]
6238 ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm6
6239 ; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm19, %zmm6
6240 ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm13
6241 ; AVX512-FCP-NEXT: vpermt2d %zmm8, %zmm19, %zmm13
6242 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm13[0,1,2,3],zmm6[4,5,6,7]
6243 ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm13
6244 ; AVX512-FCP-NEXT: vpermt2d %zmm17, %zmm19, %zmm13
6245 ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm18
6246 ; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm19, %zmm18
6247 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm18[0,1,2,3],zmm13[4,5,6,7]
6248 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm18
6249 ; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm19, %zmm18
6250 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm20
6251 ; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm19, %zmm20
6252 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm20[0,1,2,3],zmm18[4,5,6,7]
6253 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm20
6254 ; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm19, %zmm20
6255 ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm19
6256 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],zmm20[4,5,6,7]
6257 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [1,5,9,13,17,21,25,29,1,5,9,13,17,21,25,29]
6258 ; AVX512-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3]
6259 ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm20
6260 ; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm21, %zmm20
6261 ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm22
6262 ; AVX512-FCP-NEXT: vpermt2d %zmm8, %zmm21, %zmm22
6263 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm22[0,1,2,3],zmm20[4,5,6,7]
6264 ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm22
6265 ; AVX512-FCP-NEXT: vpermt2d %zmm17, %zmm21, %zmm22
6266 ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm23
6267 ; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm21, %zmm23
6268 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm23[0,1,2,3],zmm22[4,5,6,7]
6269 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm23
6270 ; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm21, %zmm23
6271 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm24
6272 ; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm21, %zmm24
6273 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm24[0,1,2,3],zmm23[4,5,6,7]
6274 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm24
6275 ; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm21, %zmm24
6276 ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm21
6277 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm21[0,1,2,3],zmm24[4,5,6,7]
6278 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [2,6,10,14,18,22,26,30,2,6,10,14,18,22,26,30]
6279 ; AVX512-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3]
6280 ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm25
6281 ; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm24, %zmm25
6282 ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm26
6283 ; AVX512-FCP-NEXT: vpermt2d %zmm8, %zmm24, %zmm26
6284 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm26[0,1,2,3],zmm25[4,5,6,7]
6285 ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm26
6286 ; AVX512-FCP-NEXT: vpermt2d %zmm17, %zmm24, %zmm26
6287 ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm27
6288 ; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm24, %zmm27
6289 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm26 = zmm27[0,1,2,3],zmm26[4,5,6,7]
6290 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm27
6291 ; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm24, %zmm27
6292 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm28
6293 ; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm24, %zmm28
6294 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm27 = zmm28[0,1,2,3],zmm27[4,5,6,7]
6295 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm28
6296 ; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm24, %zmm28
6297 ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm24
6298 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm24[0,1,2,3],zmm28[4,5,6,7]
6299 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [3,7,11,15,19,23,27,31,3,7,11,15,19,23,27,31]
6300 ; AVX512-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3]
6301 ; AVX512-FCP-NEXT: vpermt2d %zmm17, %zmm28, %zmm16
6302 ; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm28, %zmm15
6303 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm15[0,1,2,3],zmm16[4,5,6,7]
6304 ; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm28, %zmm7
6305 ; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm28, %zmm5
6306 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7]
6307 ; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm28, %zmm11
6308 ; AVX512-FCP-NEXT: vpermt2d %zmm8, %zmm28, %zmm9
6309 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,2,3],zmm11[4,5,6,7]
6310 ; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm28, %zmm2
6311 ; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm28, %zmm0
6312 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7]
6313 ; AVX512-FCP-NEXT: vmovdqa64 %zmm18, 192(%rsi)
6314 ; AVX512-FCP-NEXT: vmovdqa64 %zmm19, (%rsi)
6315 ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 64(%rsi)
6316 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 128(%rsi)
6317 ; AVX512-FCP-NEXT: vmovdqa64 %zmm23, 192(%rdx)
6318 ; AVX512-FCP-NEXT: vmovdqa64 %zmm21, (%rdx)
6319 ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, 64(%rdx)
6320 ; AVX512-FCP-NEXT: vmovdqa64 %zmm20, 128(%rdx)
6321 ; AVX512-FCP-NEXT: vmovdqa64 %zmm27, 192(%rcx)
6322 ; AVX512-FCP-NEXT: vmovdqa64 %zmm24, (%rcx)
6323 ; AVX512-FCP-NEXT: vmovdqa64 %zmm26, 64(%rcx)
6324 ; AVX512-FCP-NEXT: vmovdqa64 %zmm25, 128(%rcx)
6325 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 128(%r8)
6326 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 192(%r8)
6327 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%r8)
6328 ; AVX512-FCP-NEXT: vmovdqa64 %zmm14, 64(%r8)
6329 ; AVX512-FCP-NEXT: vzeroupper
6330 ; AVX512-FCP-NEXT: retq
6332 ; AVX512DQ-LABEL: load_i32_stride4_vf64:
6333 ; AVX512DQ: # %bb.0:
6334 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0
6335 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm1
6336 ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm2
6337 ; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm3
6338 ; AVX512DQ-NEXT: vmovdqa64 832(%rdi), %zmm4
6339 ; AVX512DQ-NEXT: vmovdqa64 768(%rdi), %zmm5
6340 ; AVX512DQ-NEXT: vmovdqa64 960(%rdi), %zmm10
6341 ; AVX512DQ-NEXT: vmovdqa64 896(%rdi), %zmm7
6342 ; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm14
6343 ; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm15
6344 ; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm17
6345 ; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm16
6346 ; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %zmm8
6347 ; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %zmm9
6348 ; AVX512DQ-NEXT: vmovdqa64 704(%rdi), %zmm12
6349 ; AVX512DQ-NEXT: vmovdqa64 640(%rdi), %zmm11
6350 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28]
6351 ; AVX512DQ-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3]
6352 ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm6
6353 ; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm19, %zmm6
6354 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm13
6355 ; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm19, %zmm13
6356 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm13[0,1,2,3],zmm6[4,5,6,7]
6357 ; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm13
6358 ; AVX512DQ-NEXT: vpermt2d %zmm17, %zmm19, %zmm13
6359 ; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm18
6360 ; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm19, %zmm18
6361 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm18[0,1,2,3],zmm13[4,5,6,7]
6362 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm18
6363 ; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm19, %zmm18
6364 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm20
6365 ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm19, %zmm20
6366 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm20[0,1,2,3],zmm18[4,5,6,7]
6367 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm20
6368 ; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm19, %zmm20
6369 ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm19
6370 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],zmm20[4,5,6,7]
6371 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [1,5,9,13,17,21,25,29,1,5,9,13,17,21,25,29]
6372 ; AVX512DQ-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3]
6373 ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm20
6374 ; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm21, %zmm20
6375 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm22
6376 ; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm21, %zmm22
6377 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm22[0,1,2,3],zmm20[4,5,6,7]
6378 ; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm22
6379 ; AVX512DQ-NEXT: vpermt2d %zmm17, %zmm21, %zmm22
6380 ; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm23
6381 ; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm21, %zmm23
6382 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm23[0,1,2,3],zmm22[4,5,6,7]
6383 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm23
6384 ; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm21, %zmm23
6385 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm24
6386 ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm21, %zmm24
6387 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm24[0,1,2,3],zmm23[4,5,6,7]
6388 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm24
6389 ; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm21, %zmm24
6390 ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm21
6391 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm21[0,1,2,3],zmm24[4,5,6,7]
6392 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [2,6,10,14,18,22,26,30,2,6,10,14,18,22,26,30]
6393 ; AVX512DQ-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3]
6394 ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm25
6395 ; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm24, %zmm25
6396 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm26
6397 ; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm24, %zmm26
6398 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm26[0,1,2,3],zmm25[4,5,6,7]
6399 ; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm26
6400 ; AVX512DQ-NEXT: vpermt2d %zmm17, %zmm24, %zmm26
6401 ; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm27
6402 ; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm24, %zmm27
6403 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm26 = zmm27[0,1,2,3],zmm26[4,5,6,7]
6404 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm27
6405 ; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm24, %zmm27
6406 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm28
6407 ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm24, %zmm28
6408 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm27 = zmm28[0,1,2,3],zmm27[4,5,6,7]
6409 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm28
6410 ; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm24, %zmm28
6411 ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm24
6412 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm24[0,1,2,3],zmm28[4,5,6,7]
6413 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [3,7,11,15,19,23,27,31,3,7,11,15,19,23,27,31]
6414 ; AVX512DQ-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3]
6415 ; AVX512DQ-NEXT: vpermt2d %zmm17, %zmm28, %zmm16
6416 ; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm28, %zmm15
6417 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm15[0,1,2,3],zmm16[4,5,6,7]
6418 ; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm28, %zmm7
6419 ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm28, %zmm5
6420 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7]
6421 ; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm28, %zmm11
6422 ; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm28, %zmm9
6423 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,2,3],zmm11[4,5,6,7]
6424 ; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm28, %zmm2
6425 ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm28, %zmm0
6426 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7]
6427 ; AVX512DQ-NEXT: vmovdqa64 %zmm18, 192(%rsi)
6428 ; AVX512DQ-NEXT: vmovdqa64 %zmm19, (%rsi)
6429 ; AVX512DQ-NEXT: vmovdqa64 %zmm13, 64(%rsi)
6430 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, 128(%rsi)
6431 ; AVX512DQ-NEXT: vmovdqa64 %zmm23, 192(%rdx)
6432 ; AVX512DQ-NEXT: vmovdqa64 %zmm21, (%rdx)
6433 ; AVX512DQ-NEXT: vmovdqa64 %zmm22, 64(%rdx)
6434 ; AVX512DQ-NEXT: vmovdqa64 %zmm20, 128(%rdx)
6435 ; AVX512DQ-NEXT: vmovdqa64 %zmm27, 192(%rcx)
6436 ; AVX512DQ-NEXT: vmovdqa64 %zmm24, (%rcx)
6437 ; AVX512DQ-NEXT: vmovdqa64 %zmm26, 64(%rcx)
6438 ; AVX512DQ-NEXT: vmovdqa64 %zmm25, 128(%rcx)
6439 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, 128(%r8)
6440 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, 192(%r8)
6441 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%r8)
6442 ; AVX512DQ-NEXT: vmovdqa64 %zmm14, 64(%r8)
6443 ; AVX512DQ-NEXT: vzeroupper
6444 ; AVX512DQ-NEXT: retq
6446 ; AVX512DQ-FCP-LABEL: load_i32_stride4_vf64:
6447 ; AVX512DQ-FCP: # %bb.0:
6448 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
6449 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
6450 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2
6451 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3
6452 ; AVX512DQ-FCP-NEXT: vmovdqa64 832(%rdi), %zmm4
6453 ; AVX512DQ-FCP-NEXT: vmovdqa64 768(%rdi), %zmm5
6454 ; AVX512DQ-FCP-NEXT: vmovdqa64 960(%rdi), %zmm10
6455 ; AVX512DQ-FCP-NEXT: vmovdqa64 896(%rdi), %zmm7
6456 ; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm14
6457 ; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm15
6458 ; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm17
6459 ; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm16
6460 ; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm8
6461 ; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm9
6462 ; AVX512DQ-FCP-NEXT: vmovdqa64 704(%rdi), %zmm12
6463 ; AVX512DQ-FCP-NEXT: vmovdqa64 640(%rdi), %zmm11
6464 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28]
6465 ; AVX512DQ-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3]
6466 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm6
6467 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm19, %zmm6
6468 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm13
6469 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm8, %zmm19, %zmm13
6470 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm13[0,1,2,3],zmm6[4,5,6,7]
6471 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm13
6472 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm17, %zmm19, %zmm13
6473 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm18
6474 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm19, %zmm18
6475 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm18[0,1,2,3],zmm13[4,5,6,7]
6476 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm18
6477 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm19, %zmm18
6478 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm20
6479 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm19, %zmm20
6480 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm20[0,1,2,3],zmm18[4,5,6,7]
6481 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm20
6482 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm19, %zmm20
6483 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm19
6484 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],zmm20[4,5,6,7]
6485 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [1,5,9,13,17,21,25,29,1,5,9,13,17,21,25,29]
6486 ; AVX512DQ-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3]
6487 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm20
6488 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm21, %zmm20
6489 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm22
6490 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm8, %zmm21, %zmm22
6491 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm22[0,1,2,3],zmm20[4,5,6,7]
6492 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm22
6493 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm17, %zmm21, %zmm22
6494 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm23
6495 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm21, %zmm23
6496 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm23[0,1,2,3],zmm22[4,5,6,7]
6497 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm23
6498 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm21, %zmm23
6499 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm24
6500 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm21, %zmm24
6501 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm24[0,1,2,3],zmm23[4,5,6,7]
6502 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm24
6503 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm21, %zmm24
6504 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm21
6505 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm21[0,1,2,3],zmm24[4,5,6,7]
6506 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [2,6,10,14,18,22,26,30,2,6,10,14,18,22,26,30]
6507 ; AVX512DQ-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3]
6508 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm25
6509 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm24, %zmm25
6510 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm26
6511 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm8, %zmm24, %zmm26
6512 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm26[0,1,2,3],zmm25[4,5,6,7]
6513 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm26
6514 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm17, %zmm24, %zmm26
6515 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm27
6516 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm24, %zmm27
6517 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm26 = zmm27[0,1,2,3],zmm26[4,5,6,7]
6518 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm27
6519 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm24, %zmm27
6520 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm28
6521 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm24, %zmm28
6522 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm27 = zmm28[0,1,2,3],zmm27[4,5,6,7]
6523 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm28
6524 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm24, %zmm28
6525 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm24
6526 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm24[0,1,2,3],zmm28[4,5,6,7]
6527 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [3,7,11,15,19,23,27,31,3,7,11,15,19,23,27,31]
6528 ; AVX512DQ-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3]
6529 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm17, %zmm28, %zmm16
6530 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm28, %zmm15
6531 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm15[0,1,2,3],zmm16[4,5,6,7]
6532 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm28, %zmm7
6533 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm28, %zmm5
6534 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7]
6535 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm28, %zmm11
6536 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm8, %zmm28, %zmm9
6537 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,2,3],zmm11[4,5,6,7]
6538 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm28, %zmm2
6539 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm28, %zmm0
6540 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7]
6541 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, 192(%rsi)
6542 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, (%rsi)
6543 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 64(%rsi)
6544 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 128(%rsi)
6545 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, 192(%rdx)
6546 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, (%rdx)
6547 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, 64(%rdx)
6548 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, 128(%rdx)
6549 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, 192(%rcx)
6550 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, (%rcx)
6551 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, 64(%rcx)
6552 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, 128(%rcx)
6553 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 128(%r8)
6554 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 192(%r8)
6555 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%r8)
6556 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, 64(%r8)
6557 ; AVX512DQ-FCP-NEXT: vzeroupper
6558 ; AVX512DQ-FCP-NEXT: retq
; AVX512BW-LABEL: load_i32_stride4_vf64:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1
; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm2
; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm3
; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm4
; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm5
; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm10
; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm7
; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm14
; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm15
; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm17
; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm16
; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm8
; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm9
; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm12
; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm11
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28]
; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm6
; AVX512BW-NEXT: vpermt2d %zmm12, %zmm19, %zmm6
; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm13
; AVX512BW-NEXT: vpermt2d %zmm8, %zmm19, %zmm13
; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm13[0,1,2,3],zmm6[4,5,6,7]
; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm13
; AVX512BW-NEXT: vpermt2d %zmm17, %zmm19, %zmm13
; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm18
; AVX512BW-NEXT: vpermt2d %zmm14, %zmm19, %zmm18
; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm18[0,1,2,3],zmm13[4,5,6,7]
; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm18
; AVX512BW-NEXT: vpermt2d %zmm10, %zmm19, %zmm18
; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm20
; AVX512BW-NEXT: vpermt2d %zmm4, %zmm19, %zmm20
; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm20[0,1,2,3],zmm18[4,5,6,7]
; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm20
; AVX512BW-NEXT: vpermt2d %zmm3, %zmm19, %zmm20
; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm19
; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],zmm20[4,5,6,7]
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [1,5,9,13,17,21,25,29,1,5,9,13,17,21,25,29]
; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm20
; AVX512BW-NEXT: vpermt2d %zmm12, %zmm21, %zmm20
; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm22
; AVX512BW-NEXT: vpermt2d %zmm8, %zmm21, %zmm22
; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm22[0,1,2,3],zmm20[4,5,6,7]
; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm22
; AVX512BW-NEXT: vpermt2d %zmm17, %zmm21, %zmm22
; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm23
; AVX512BW-NEXT: vpermt2d %zmm14, %zmm21, %zmm23
; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm23[0,1,2,3],zmm22[4,5,6,7]
; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm23
; AVX512BW-NEXT: vpermt2d %zmm10, %zmm21, %zmm23
; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm24
; AVX512BW-NEXT: vpermt2d %zmm4, %zmm21, %zmm24
; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm24[0,1,2,3],zmm23[4,5,6,7]
; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm24
; AVX512BW-NEXT: vpermt2d %zmm3, %zmm21, %zmm24
; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm21
; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm21[0,1,2,3],zmm24[4,5,6,7]
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [2,6,10,14,18,22,26,30,2,6,10,14,18,22,26,30]
; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm25
; AVX512BW-NEXT: vpermt2d %zmm12, %zmm24, %zmm25
; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm26
; AVX512BW-NEXT: vpermt2d %zmm8, %zmm24, %zmm26
; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm26[0,1,2,3],zmm25[4,5,6,7]
; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm26
; AVX512BW-NEXT: vpermt2d %zmm17, %zmm24, %zmm26
; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm27
; AVX512BW-NEXT: vpermt2d %zmm14, %zmm24, %zmm27
; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm26 = zmm27[0,1,2,3],zmm26[4,5,6,7]
; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm27
; AVX512BW-NEXT: vpermt2d %zmm10, %zmm24, %zmm27
; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm28
; AVX512BW-NEXT: vpermt2d %zmm4, %zmm24, %zmm28
; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm27 = zmm28[0,1,2,3],zmm27[4,5,6,7]
; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm28
; AVX512BW-NEXT: vpermt2d %zmm3, %zmm24, %zmm28
; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm24
; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm24[0,1,2,3],zmm28[4,5,6,7]
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [3,7,11,15,19,23,27,31,3,7,11,15,19,23,27,31]
; AVX512BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpermt2d %zmm17, %zmm28, %zmm16
; AVX512BW-NEXT: vpermt2d %zmm14, %zmm28, %zmm15
; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm15[0,1,2,3],zmm16[4,5,6,7]
; AVX512BW-NEXT: vpermt2d %zmm10, %zmm28, %zmm7
; AVX512BW-NEXT: vpermt2d %zmm4, %zmm28, %zmm5
; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7]
; AVX512BW-NEXT: vpermt2d %zmm12, %zmm28, %zmm11
; AVX512BW-NEXT: vpermt2d %zmm8, %zmm28, %zmm9
; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,2,3],zmm11[4,5,6,7]
; AVX512BW-NEXT: vpermt2d %zmm3, %zmm28, %zmm2
; AVX512BW-NEXT: vpermt2d %zmm1, %zmm28, %zmm0
; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7]
; AVX512BW-NEXT: vmovdqa64 %zmm18, 192(%rsi)
; AVX512BW-NEXT: vmovdqa64 %zmm19, (%rsi)
; AVX512BW-NEXT: vmovdqa64 %zmm13, 64(%rsi)
; AVX512BW-NEXT: vmovdqa64 %zmm6, 128(%rsi)
; AVX512BW-NEXT: vmovdqa64 %zmm23, 192(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm21, (%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm22, 64(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm20, 128(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm27, 192(%rcx)
; AVX512BW-NEXT: vmovdqa64 %zmm24, (%rcx)
; AVX512BW-NEXT: vmovdqa64 %zmm26, 64(%rcx)
; AVX512BW-NEXT: vmovdqa64 %zmm25, 128(%rcx)
; AVX512BW-NEXT: vmovdqa64 %zmm5, 128(%r8)
; AVX512BW-NEXT: vmovdqa64 %zmm4, 192(%r8)
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%r8)
; AVX512BW-NEXT: vmovdqa64 %zmm14, 64(%r8)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BW-FCP-LABEL: load_i32_stride4_vf64:
; AVX512BW-FCP: # %bb.0:
; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2
; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3
; AVX512BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm4
; AVX512BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm5
; AVX512BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm10
; AVX512BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm7
; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm14
; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm15
; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm17
; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm16
; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm8
; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm9
; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm12
; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm11
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28]
; AVX512BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm6
; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm19, %zmm6
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm13
; AVX512BW-FCP-NEXT: vpermt2d %zmm8, %zmm19, %zmm13
; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm13[0,1,2,3],zmm6[4,5,6,7]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm13
; AVX512BW-FCP-NEXT: vpermt2d %zmm17, %zmm19, %zmm13
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm18
; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm19, %zmm18
; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm18[0,1,2,3],zmm13[4,5,6,7]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm18
; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm19, %zmm18
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm20
; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm19, %zmm20
; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm20[0,1,2,3],zmm18[4,5,6,7]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm20
; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm19, %zmm20
; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm19
; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],zmm20[4,5,6,7]
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [1,5,9,13,17,21,25,29,1,5,9,13,17,21,25,29]
; AVX512BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm20
; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm21, %zmm20
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm22
; AVX512BW-FCP-NEXT: vpermt2d %zmm8, %zmm21, %zmm22
; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm22[0,1,2,3],zmm20[4,5,6,7]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm22
; AVX512BW-FCP-NEXT: vpermt2d %zmm17, %zmm21, %zmm22
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm23
; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm21, %zmm23
; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm23[0,1,2,3],zmm22[4,5,6,7]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm23
; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm21, %zmm23
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm24
; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm21, %zmm24
; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm24[0,1,2,3],zmm23[4,5,6,7]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm24
; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm21, %zmm24
; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm21
; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm21[0,1,2,3],zmm24[4,5,6,7]
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [2,6,10,14,18,22,26,30,2,6,10,14,18,22,26,30]
; AVX512BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm25
; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm24, %zmm25
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm26
; AVX512BW-FCP-NEXT: vpermt2d %zmm8, %zmm24, %zmm26
; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm26[0,1,2,3],zmm25[4,5,6,7]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm26
; AVX512BW-FCP-NEXT: vpermt2d %zmm17, %zmm24, %zmm26
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm27
; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm24, %zmm27
; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm26 = zmm27[0,1,2,3],zmm26[4,5,6,7]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm27
; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm24, %zmm27
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm28
; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm24, %zmm28
; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm27 = zmm28[0,1,2,3],zmm27[4,5,6,7]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm28
; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm24, %zmm28
; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm24
; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm24[0,1,2,3],zmm28[4,5,6,7]
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [3,7,11,15,19,23,27,31,3,7,11,15,19,23,27,31]
; AVX512BW-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vpermt2d %zmm17, %zmm28, %zmm16
; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm28, %zmm15
; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm15[0,1,2,3],zmm16[4,5,6,7]
; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm28, %zmm7
; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm28, %zmm5
; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7]
; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm28, %zmm11
; AVX512BW-FCP-NEXT: vpermt2d %zmm8, %zmm28, %zmm9
; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,2,3],zmm11[4,5,6,7]
; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm28, %zmm2
; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm28, %zmm0
; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, 192(%rsi)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, (%rsi)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, 64(%rsi)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 128(%rsi)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, 192(%rdx)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, (%rdx)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, 64(%rdx)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, 128(%rdx)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, 192(%rcx)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, (%rcx)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, 64(%rcx)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, 128(%rcx)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 128(%r8)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 192(%r8)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%r8)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, 64(%r8)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
; AVX512DQ-BW-LABEL: load_i32_stride4_vf64:
; AVX512DQ-BW: # %bb.0:
; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1
; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm2
; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm3
; AVX512DQ-BW-NEXT: vmovdqa64 832(%rdi), %zmm4
; AVX512DQ-BW-NEXT: vmovdqa64 768(%rdi), %zmm5
; AVX512DQ-BW-NEXT: vmovdqa64 960(%rdi), %zmm10
; AVX512DQ-BW-NEXT: vmovdqa64 896(%rdi), %zmm7
; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm14
; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm15
; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm17
; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm16
; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %zmm8
; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %zmm9
; AVX512DQ-BW-NEXT: vmovdqa64 704(%rdi), %zmm12
; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %zmm11
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28]
; AVX512DQ-BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm6
; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm19, %zmm6
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm13
; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm19, %zmm13
; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm13[0,1,2,3],zmm6[4,5,6,7]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm13
; AVX512DQ-BW-NEXT: vpermt2d %zmm17, %zmm19, %zmm13
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm18
; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm19, %zmm18
; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm18[0,1,2,3],zmm13[4,5,6,7]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm18
; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm19, %zmm18
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm20
; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm19, %zmm20
; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm20[0,1,2,3],zmm18[4,5,6,7]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm20
; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm19, %zmm20
; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm19
; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],zmm20[4,5,6,7]
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [1,5,9,13,17,21,25,29,1,5,9,13,17,21,25,29]
; AVX512DQ-BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm20
; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm21, %zmm20
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm22
; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm21, %zmm22
; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm22[0,1,2,3],zmm20[4,5,6,7]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm22
; AVX512DQ-BW-NEXT: vpermt2d %zmm17, %zmm21, %zmm22
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm23
; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm21, %zmm23
; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm23[0,1,2,3],zmm22[4,5,6,7]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm23
; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm21, %zmm23
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm24
; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm21, %zmm24
; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm24[0,1,2,3],zmm23[4,5,6,7]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm24
; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm21, %zmm24
; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm21
; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm21[0,1,2,3],zmm24[4,5,6,7]
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [2,6,10,14,18,22,26,30,2,6,10,14,18,22,26,30]
; AVX512DQ-BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm25
; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm24, %zmm25
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm26
; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm24, %zmm26
; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm26[0,1,2,3],zmm25[4,5,6,7]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm26
; AVX512DQ-BW-NEXT: vpermt2d %zmm17, %zmm24, %zmm26
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm27
; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm24, %zmm27
; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm26 = zmm27[0,1,2,3],zmm26[4,5,6,7]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm27
; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm24, %zmm27
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm28
; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm24, %zmm28
; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm27 = zmm28[0,1,2,3],zmm27[4,5,6,7]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm28
; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm24, %zmm28
; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm24
; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm24[0,1,2,3],zmm28[4,5,6,7]
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [3,7,11,15,19,23,27,31,3,7,11,15,19,23,27,31]
; AVX512DQ-BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vpermt2d %zmm17, %zmm28, %zmm16
; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm28, %zmm15
; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm15[0,1,2,3],zmm16[4,5,6,7]
; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm28, %zmm7
; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm28, %zmm5
; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7]
; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm28, %zmm11
; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm28, %zmm9
; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,2,3],zmm11[4,5,6,7]
; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm28, %zmm2
; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm28, %zmm0
; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, 192(%rsi)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, (%rsi)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, 64(%rsi)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 128(%rsi)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, 192(%rdx)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, (%rdx)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, 64(%rdx)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, 128(%rdx)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, 192(%rcx)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, (%rcx)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, 64(%rcx)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, 128(%rcx)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 128(%r8)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 192(%r8)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%r8)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, 64(%r8)
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
; AVX512DQ-BW-FCP-LABEL: load_i32_stride4_vf64:
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm4
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm5
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm10
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm7
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm14
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm15
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm17
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm16
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm8
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm9
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm12
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm11
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28]
; AVX512DQ-BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm6
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm19, %zmm6
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm13
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm8, %zmm19, %zmm13
; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm13[0,1,2,3],zmm6[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm13
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm17, %zmm19, %zmm13
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm18
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm19, %zmm18
; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm18[0,1,2,3],zmm13[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm18
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm19, %zmm18
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm20
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm19, %zmm20
; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm20[0,1,2,3],zmm18[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm20
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm19, %zmm20
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm19
; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],zmm20[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [1,5,9,13,17,21,25,29,1,5,9,13,17,21,25,29]
; AVX512DQ-BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm20
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm21, %zmm20
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm22
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm8, %zmm21, %zmm22
; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm22[0,1,2,3],zmm20[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm22
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm17, %zmm21, %zmm22
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm23
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm21, %zmm23
; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm23[0,1,2,3],zmm22[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm23
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm21, %zmm23
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm24
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm21, %zmm24
; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm24[0,1,2,3],zmm23[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm24
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm21, %zmm24
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm21
; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm21[0,1,2,3],zmm24[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [2,6,10,14,18,22,26,30,2,6,10,14,18,22,26,30]
; AVX512DQ-BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm25
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm24, %zmm25
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm26
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm8, %zmm24, %zmm26
; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm26[0,1,2,3],zmm25[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm26
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm17, %zmm24, %zmm26
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm27
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm24, %zmm27
; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm26 = zmm27[0,1,2,3],zmm26[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm27
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm24, %zmm27
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm28
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm24, %zmm28
; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm27 = zmm28[0,1,2,3],zmm27[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm28
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm24, %zmm28
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm24
; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm24[0,1,2,3],zmm28[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [3,7,11,15,19,23,27,31,3,7,11,15,19,23,27,31]
; AVX512DQ-BW-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm17, %zmm28, %zmm16
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm28, %zmm15
; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm15[0,1,2,3],zmm16[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm28, %zmm7
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm28, %zmm5
; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm28, %zmm11
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm8, %zmm28, %zmm9
; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,2,3],zmm11[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm28, %zmm2
; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm28, %zmm0
; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, 192(%rsi)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, (%rsi)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, 64(%rsi)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 128(%rsi)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, 192(%rdx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, (%rdx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, 64(%rdx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, 128(%rdx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, 192(%rcx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, (%rcx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, 64(%rcx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, 128(%rcx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 128(%r8)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 192(%r8)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%r8)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, 64(%r8)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
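; The IR below loads 256 contiguous i32 values as one wide vector, shuffles
; them into the four stride-4 subsequences (element offsets 0, 1, 2 and 3
; within each group of four), and stores each 64-element result to its
; corresponding output pointer.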
%wide.vec = load <256 x i32>, ptr %in.vec, align 64
%strided.vec0 = shufflevector <256 x i32> %wide.vec, <256 x i32> poison, <64 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60, i32 64, i32 68, i32 72, i32 76, i32 80, i32 84, i32 88, i32 92, i32 96, i32 100, i32 104, i32 108, i32 112, i32 116, i32 120, i32 124, i32 128, i32 132, i32 136, i32 140, i32 144, i32 148, i32 152, i32 156, i32 160, i32 164, i32 168, i32 172, i32 176, i32 180, i32 184, i32 188, i32 192, i32 196, i32 200, i32 204, i32 208, i32 212, i32 216, i32 220, i32 224, i32 228, i32 232, i32 236, i32 240, i32 244, i32 248, i32 252>
%strided.vec1 = shufflevector <256 x i32> %wide.vec, <256 x i32> poison, <64 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61, i32 65, i32 69, i32 73, i32 77, i32 81, i32 85, i32 89, i32 93, i32 97, i32 101, i32 105, i32 109, i32 113, i32 117, i32 121, i32 125, i32 129, i32 133, i32 137, i32 141, i32 145, i32 149, i32 153, i32 157, i32 161, i32 165, i32 169, i32 173, i32 177, i32 181, i32 185, i32 189, i32 193, i32 197, i32 201, i32 205, i32 209, i32 213, i32 217, i32 221, i32 225, i32 229, i32 233, i32 237, i32 241, i32 245, i32 249, i32 253>
%strided.vec2 = shufflevector <256 x i32> %wide.vec, <256 x i32> poison, <64 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62, i32 66, i32 70, i32 74, i32 78, i32 82, i32 86, i32 90, i32 94, i32 98, i32 102, i32 106, i32 110, i32 114, i32 118, i32 122, i32 126, i32 130, i32 134, i32 138, i32 142, i32 146, i32 150, i32 154, i32 158, i32 162, i32 166, i32 170, i32 174, i32 178, i32 182, i32 186, i32 190, i32 194, i32 198, i32 202, i32 206, i32 210, i32 214, i32 218, i32 222, i32 226, i32 230, i32 234, i32 238, i32 242, i32 246, i32 250, i32 254>
%strided.vec3 = shufflevector <256 x i32> %wide.vec, <256 x i32> poison, <64 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63, i32 67, i32 71, i32 75, i32 79, i32 83, i32 87, i32 91, i32 95, i32 99, i32 103, i32 107, i32 111, i32 115, i32 119, i32 123, i32 127, i32 131, i32 135, i32 139, i32 143, i32 147, i32 151, i32 155, i32 159, i32 163, i32 167, i32 171, i32 175, i32 179, i32 183, i32 187, i32 191, i32 195, i32 199, i32 203, i32 207, i32 211, i32 215, i32 219, i32 223, i32 227, i32 231, i32 235, i32 239, i32 243, i32 247, i32 251, i32 255>
store <64 x i32> %strided.vec0, ptr %out.vec0, align 64
store <64 x i32> %strided.vec1, ptr %out.vec1, align 64
store <64 x i32> %strided.vec2, ptr %out.vec2, align 64
store <64 x i32> %strided.vec3, ptr %out.vec3, align 64