1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
3 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX
4 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
5 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FP
6 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FCP
7 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512
8 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512-FCP
9 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512DQ
10 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-FCP
11 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW
12 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW-FCP
13 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512DQ-BW
14 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-BW-FCP
16 ; These patterns are produced by LoopVectorizer for interleaved loads.
; Stride-8 deinterleaving load with two-element results: a single <16 x i64>
; load from %in.vec is split into eight <2 x i64> vectors, where result N holds
; wide-vector elements N and N+8, and each result is stored through the
; matching %out.vecN pointer.
; The per-prefix assertion lines below are autogenerated (see the NOTE on the
; first line of this file); regenerate them instead of editing them by hand.
18 define void @load_i64_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6, ptr %out.vec7) nounwind {
19 ; SSE-LABEL: load_i64_stride8_vf2:
21 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
22 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10
23 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r11
24 ; SSE-NEXT: movaps 112(%rdi), %xmm0
25 ; SSE-NEXT: movaps 96(%rdi), %xmm1
26 ; SSE-NEXT: movaps 80(%rdi), %xmm2
27 ; SSE-NEXT: movaps 64(%rdi), %xmm3
28 ; SSE-NEXT: movaps (%rdi), %xmm4
29 ; SSE-NEXT: movaps 16(%rdi), %xmm5
30 ; SSE-NEXT: movaps 32(%rdi), %xmm6
31 ; SSE-NEXT: movaps 48(%rdi), %xmm7
32 ; SSE-NEXT: movaps %xmm4, %xmm8
33 ; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm3[0]
34 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
35 ; SSE-NEXT: movaps %xmm5, %xmm3
36 ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0]
37 ; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm2[1]
38 ; SSE-NEXT: movaps %xmm6, %xmm2
39 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
40 ; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm1[1]
41 ; SSE-NEXT: movaps %xmm7, %xmm1
42 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
43 ; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1]
44 ; SSE-NEXT: movaps %xmm8, (%rsi)
45 ; SSE-NEXT: movaps %xmm4, (%rdx)
46 ; SSE-NEXT: movaps %xmm3, (%rcx)
47 ; SSE-NEXT: movaps %xmm5, (%r8)
48 ; SSE-NEXT: movaps %xmm2, (%r9)
49 ; SSE-NEXT: movaps %xmm6, (%r11)
50 ; SSE-NEXT: movaps %xmm1, (%r10)
51 ; SSE-NEXT: movaps %xmm7, (%rax)
54 ; AVX-LABEL: load_i64_stride8_vf2:
56 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
57 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10
58 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r11
59 ; AVX-NEXT: vmovaps 64(%rdi), %xmm0
60 ; AVX-NEXT: vmovaps (%rdi), %xmm1
61 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
62 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
63 ; AVX-NEXT: vmovaps 80(%rdi), %xmm1
64 ; AVX-NEXT: vmovaps 16(%rdi), %xmm3
65 ; AVX-NEXT: vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm1[0]
66 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm3[1],xmm1[1]
67 ; AVX-NEXT: vmovaps 96(%rdi), %xmm3
68 ; AVX-NEXT: vmovaps 32(%rdi), %xmm5
69 ; AVX-NEXT: vmovlhps {{.*#+}} xmm6 = xmm5[0],xmm3[0]
70 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm5[1],xmm3[1]
71 ; AVX-NEXT: vmovaps 112(%rdi), %xmm5
72 ; AVX-NEXT: vmovaps 48(%rdi), %xmm7
73 ; AVX-NEXT: vmovlhps {{.*#+}} xmm8 = xmm7[0],xmm5[0]
74 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm7[1],xmm5[1]
75 ; AVX-NEXT: vmovaps %xmm2, (%rsi)
76 ; AVX-NEXT: vmovaps %xmm0, (%rdx)
77 ; AVX-NEXT: vmovaps %xmm4, (%rcx)
78 ; AVX-NEXT: vmovaps %xmm1, (%r8)
79 ; AVX-NEXT: vmovaps %xmm6, (%r9)
80 ; AVX-NEXT: vmovaps %xmm3, (%r11)
81 ; AVX-NEXT: vmovaps %xmm8, (%r10)
82 ; AVX-NEXT: vmovaps %xmm5, (%rax)
85 ; AVX2-LABEL: load_i64_stride8_vf2:
87 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
88 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
89 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11
90 ; AVX2-NEXT: vmovaps 96(%rdi), %ymm0
91 ; AVX2-NEXT: vmovaps 32(%rdi), %ymm1
92 ; AVX2-NEXT: vmovaps 64(%rdi), %ymm2
93 ; AVX2-NEXT: vmovaps (%rdi), %ymm3
94 ; AVX2-NEXT: vmovaps (%rdi), %xmm4
95 ; AVX2-NEXT: vmovaps 32(%rdi), %xmm5
96 ; AVX2-NEXT: vmovaps 64(%rdi), %xmm6
97 ; AVX2-NEXT: vmovlhps {{.*#+}} xmm7 = xmm4[0],xmm6[0]
98 ; AVX2-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm6[1]
99 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
100 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm3[1],ymm2[1],ymm3[3],ymm2[3]
101 ; AVX2-NEXT: vmovaps 96(%rdi), %xmm3
102 ; AVX2-NEXT: vmovlhps {{.*#+}} xmm8 = xmm5[0],xmm3[0]
103 ; AVX2-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm5[1],xmm3[1]
104 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
105 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
106 ; AVX2-NEXT: vmovaps %xmm7, (%rsi)
107 ; AVX2-NEXT: vmovaps %xmm4, (%rdx)
108 ; AVX2-NEXT: vextractf128 $1, %ymm6, (%rcx)
109 ; AVX2-NEXT: vextractf128 $1, %ymm2, (%r8)
110 ; AVX2-NEXT: vmovaps %xmm8, (%r9)
111 ; AVX2-NEXT: vmovaps %xmm3, (%r11)
112 ; AVX2-NEXT: vextractf128 $1, %ymm5, (%r10)
113 ; AVX2-NEXT: vextractf128 $1, %ymm0, (%rax)
114 ; AVX2-NEXT: vzeroupper
117 ; AVX2-FP-LABEL: load_i64_stride8_vf2:
119 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
120 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10
121 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r11
122 ; AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm0
123 ; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm1
124 ; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm2
125 ; AVX2-FP-NEXT: vmovaps (%rdi), %ymm3
126 ; AVX2-FP-NEXT: vmovaps (%rdi), %xmm4
127 ; AVX2-FP-NEXT: vmovaps 32(%rdi), %xmm5
128 ; AVX2-FP-NEXT: vmovaps 64(%rdi), %xmm6
129 ; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm7 = xmm4[0],xmm6[0]
130 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm6[1]
131 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
132 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm3[1],ymm2[1],ymm3[3],ymm2[3]
133 ; AVX2-FP-NEXT: vmovaps 96(%rdi), %xmm3
134 ; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm8 = xmm5[0],xmm3[0]
135 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm5[1],xmm3[1]
136 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
137 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
138 ; AVX2-FP-NEXT: vmovaps %xmm7, (%rsi)
139 ; AVX2-FP-NEXT: vmovaps %xmm4, (%rdx)
140 ; AVX2-FP-NEXT: vextractf128 $1, %ymm6, (%rcx)
141 ; AVX2-FP-NEXT: vextractf128 $1, %ymm2, (%r8)
142 ; AVX2-FP-NEXT: vmovaps %xmm8, (%r9)
143 ; AVX2-FP-NEXT: vmovaps %xmm3, (%r11)
144 ; AVX2-FP-NEXT: vextractf128 $1, %ymm5, (%r10)
145 ; AVX2-FP-NEXT: vextractf128 $1, %ymm0, (%rax)
146 ; AVX2-FP-NEXT: vzeroupper
149 ; AVX2-FCP-LABEL: load_i64_stride8_vf2:
151 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
152 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
153 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
154 ; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm0
155 ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm1
156 ; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm2
157 ; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm3
158 ; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm4
159 ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %xmm5
160 ; AVX2-FCP-NEXT: vmovaps 64(%rdi), %xmm6
161 ; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm7 = xmm4[0],xmm6[0]
162 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm6[1]
163 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
164 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm3[1],ymm2[1],ymm3[3],ymm2[3]
165 ; AVX2-FCP-NEXT: vmovaps 96(%rdi), %xmm3
166 ; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm8 = xmm5[0],xmm3[0]
167 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm5[1],xmm3[1]
168 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
169 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
170 ; AVX2-FCP-NEXT: vmovaps %xmm7, (%rsi)
171 ; AVX2-FCP-NEXT: vmovaps %xmm4, (%rdx)
172 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm6, (%rcx)
173 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm2, (%r8)
174 ; AVX2-FCP-NEXT: vmovaps %xmm8, (%r9)
175 ; AVX2-FCP-NEXT: vmovaps %xmm3, (%r11)
176 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm5, (%r10)
177 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm0, (%rax)
178 ; AVX2-FCP-NEXT: vzeroupper
179 ; AVX2-FCP-NEXT: retq
181 ; AVX512-LABEL: load_i64_stride8_vf2:
183 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
184 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
185 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11
186 ; AVX512-NEXT: vmovaps 64(%rdi), %xmm0
187 ; AVX512-NEXT: vmovaps (%rdi), %xmm1
188 ; AVX512-NEXT: vmovaps 16(%rdi), %xmm2
189 ; AVX512-NEXT: vmovaps 32(%rdi), %xmm3
190 ; AVX512-NEXT: vmovaps 48(%rdi), %xmm4
191 ; AVX512-NEXT: vmovlhps {{.*#+}} xmm5 = xmm1[0],xmm0[0]
192 ; AVX512-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
193 ; AVX512-NEXT: vmovaps 80(%rdi), %xmm1
194 ; AVX512-NEXT: vmovlhps {{.*#+}} xmm6 = xmm2[0],xmm1[0]
195 ; AVX512-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1]
196 ; AVX512-NEXT: vmovaps 96(%rdi), %xmm2
197 ; AVX512-NEXT: vmovlhps {{.*#+}} xmm7 = xmm3[0],xmm2[0]
198 ; AVX512-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1]
199 ; AVX512-NEXT: vmovaps 112(%rdi), %xmm3
200 ; AVX512-NEXT: vmovlhps {{.*#+}} xmm8 = xmm4[0],xmm3[0]
201 ; AVX512-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm4[1],xmm3[1]
202 ; AVX512-NEXT: vmovaps %xmm5, (%rsi)
203 ; AVX512-NEXT: vmovaps %xmm0, (%rdx)
204 ; AVX512-NEXT: vmovaps %xmm6, (%rcx)
205 ; AVX512-NEXT: vmovaps %xmm1, (%r8)
206 ; AVX512-NEXT: vmovaps %xmm7, (%r9)
207 ; AVX512-NEXT: vmovaps %xmm2, (%r11)
208 ; AVX512-NEXT: vmovaps %xmm8, (%r10)
209 ; AVX512-NEXT: vmovaps %xmm3, (%rax)
212 ; AVX512-FCP-LABEL: load_i64_stride8_vf2:
213 ; AVX512-FCP: # %bb.0:
214 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
215 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
216 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
217 ; AVX512-FCP-NEXT: vmovaps 64(%rdi), %xmm0
218 ; AVX512-FCP-NEXT: vmovaps (%rdi), %xmm1
219 ; AVX512-FCP-NEXT: vmovaps 16(%rdi), %xmm2
220 ; AVX512-FCP-NEXT: vmovaps 32(%rdi), %xmm3
221 ; AVX512-FCP-NEXT: vmovaps 48(%rdi), %xmm4
222 ; AVX512-FCP-NEXT: vmovlhps {{.*#+}} xmm5 = xmm1[0],xmm0[0]
223 ; AVX512-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
224 ; AVX512-FCP-NEXT: vmovaps 80(%rdi), %xmm1
225 ; AVX512-FCP-NEXT: vmovlhps {{.*#+}} xmm6 = xmm2[0],xmm1[0]
226 ; AVX512-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1]
227 ; AVX512-FCP-NEXT: vmovaps 96(%rdi), %xmm2
228 ; AVX512-FCP-NEXT: vmovlhps {{.*#+}} xmm7 = xmm3[0],xmm2[0]
229 ; AVX512-FCP-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1]
230 ; AVX512-FCP-NEXT: vmovaps 112(%rdi), %xmm3
231 ; AVX512-FCP-NEXT: vmovlhps {{.*#+}} xmm8 = xmm4[0],xmm3[0]
232 ; AVX512-FCP-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm4[1],xmm3[1]
233 ; AVX512-FCP-NEXT: vmovaps %xmm5, (%rsi)
234 ; AVX512-FCP-NEXT: vmovaps %xmm0, (%rdx)
235 ; AVX512-FCP-NEXT: vmovaps %xmm6, (%rcx)
236 ; AVX512-FCP-NEXT: vmovaps %xmm1, (%r8)
237 ; AVX512-FCP-NEXT: vmovaps %xmm7, (%r9)
238 ; AVX512-FCP-NEXT: vmovaps %xmm2, (%r11)
239 ; AVX512-FCP-NEXT: vmovaps %xmm8, (%r10)
240 ; AVX512-FCP-NEXT: vmovaps %xmm3, (%rax)
241 ; AVX512-FCP-NEXT: retq
243 ; AVX512DQ-LABEL: load_i64_stride8_vf2:
245 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
246 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10
247 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r11
248 ; AVX512DQ-NEXT: vmovaps 64(%rdi), %xmm0
249 ; AVX512DQ-NEXT: vmovaps (%rdi), %xmm1
250 ; AVX512DQ-NEXT: vmovaps 16(%rdi), %xmm2
251 ; AVX512DQ-NEXT: vmovaps 32(%rdi), %xmm3
252 ; AVX512DQ-NEXT: vmovaps 48(%rdi), %xmm4
253 ; AVX512DQ-NEXT: vmovlhps {{.*#+}} xmm5 = xmm1[0],xmm0[0]
254 ; AVX512DQ-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
255 ; AVX512DQ-NEXT: vmovaps 80(%rdi), %xmm1
256 ; AVX512DQ-NEXT: vmovlhps {{.*#+}} xmm6 = xmm2[0],xmm1[0]
257 ; AVX512DQ-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1]
258 ; AVX512DQ-NEXT: vmovaps 96(%rdi), %xmm2
259 ; AVX512DQ-NEXT: vmovlhps {{.*#+}} xmm7 = xmm3[0],xmm2[0]
260 ; AVX512DQ-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1]
261 ; AVX512DQ-NEXT: vmovaps 112(%rdi), %xmm3
262 ; AVX512DQ-NEXT: vmovlhps {{.*#+}} xmm8 = xmm4[0],xmm3[0]
263 ; AVX512DQ-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm4[1],xmm3[1]
264 ; AVX512DQ-NEXT: vmovaps %xmm5, (%rsi)
265 ; AVX512DQ-NEXT: vmovaps %xmm0, (%rdx)
266 ; AVX512DQ-NEXT: vmovaps %xmm6, (%rcx)
267 ; AVX512DQ-NEXT: vmovaps %xmm1, (%r8)
268 ; AVX512DQ-NEXT: vmovaps %xmm7, (%r9)
269 ; AVX512DQ-NEXT: vmovaps %xmm2, (%r11)
270 ; AVX512DQ-NEXT: vmovaps %xmm8, (%r10)
271 ; AVX512DQ-NEXT: vmovaps %xmm3, (%rax)
272 ; AVX512DQ-NEXT: retq
274 ; AVX512DQ-FCP-LABEL: load_i64_stride8_vf2:
275 ; AVX512DQ-FCP: # %bb.0:
276 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
277 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
278 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
279 ; AVX512DQ-FCP-NEXT: vmovaps 64(%rdi), %xmm0
280 ; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %xmm1
281 ; AVX512DQ-FCP-NEXT: vmovaps 16(%rdi), %xmm2
282 ; AVX512DQ-FCP-NEXT: vmovaps 32(%rdi), %xmm3
283 ; AVX512DQ-FCP-NEXT: vmovaps 48(%rdi), %xmm4
284 ; AVX512DQ-FCP-NEXT: vmovlhps {{.*#+}} xmm5 = xmm1[0],xmm0[0]
285 ; AVX512DQ-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
286 ; AVX512DQ-FCP-NEXT: vmovaps 80(%rdi), %xmm1
287 ; AVX512DQ-FCP-NEXT: vmovlhps {{.*#+}} xmm6 = xmm2[0],xmm1[0]
288 ; AVX512DQ-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1]
289 ; AVX512DQ-FCP-NEXT: vmovaps 96(%rdi), %xmm2
290 ; AVX512DQ-FCP-NEXT: vmovlhps {{.*#+}} xmm7 = xmm3[0],xmm2[0]
291 ; AVX512DQ-FCP-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1]
292 ; AVX512DQ-FCP-NEXT: vmovaps 112(%rdi), %xmm3
293 ; AVX512DQ-FCP-NEXT: vmovlhps {{.*#+}} xmm8 = xmm4[0],xmm3[0]
294 ; AVX512DQ-FCP-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm4[1],xmm3[1]
295 ; AVX512DQ-FCP-NEXT: vmovaps %xmm5, (%rsi)
296 ; AVX512DQ-FCP-NEXT: vmovaps %xmm0, (%rdx)
297 ; AVX512DQ-FCP-NEXT: vmovaps %xmm6, (%rcx)
298 ; AVX512DQ-FCP-NEXT: vmovaps %xmm1, (%r8)
299 ; AVX512DQ-FCP-NEXT: vmovaps %xmm7, (%r9)
300 ; AVX512DQ-FCP-NEXT: vmovaps %xmm2, (%r11)
301 ; AVX512DQ-FCP-NEXT: vmovaps %xmm8, (%r10)
302 ; AVX512DQ-FCP-NEXT: vmovaps %xmm3, (%rax)
303 ; AVX512DQ-FCP-NEXT: retq
305 ; AVX512BW-LABEL: load_i64_stride8_vf2:
307 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
308 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
309 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
310 ; AVX512BW-NEXT: vmovaps 64(%rdi), %xmm0
311 ; AVX512BW-NEXT: vmovaps (%rdi), %xmm1
312 ; AVX512BW-NEXT: vmovaps 16(%rdi), %xmm2
313 ; AVX512BW-NEXT: vmovaps 32(%rdi), %xmm3
314 ; AVX512BW-NEXT: vmovaps 48(%rdi), %xmm4
315 ; AVX512BW-NEXT: vmovlhps {{.*#+}} xmm5 = xmm1[0],xmm0[0]
316 ; AVX512BW-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
317 ; AVX512BW-NEXT: vmovaps 80(%rdi), %xmm1
318 ; AVX512BW-NEXT: vmovlhps {{.*#+}} xmm6 = xmm2[0],xmm1[0]
319 ; AVX512BW-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1]
320 ; AVX512BW-NEXT: vmovaps 96(%rdi), %xmm2
321 ; AVX512BW-NEXT: vmovlhps {{.*#+}} xmm7 = xmm3[0],xmm2[0]
322 ; AVX512BW-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1]
323 ; AVX512BW-NEXT: vmovaps 112(%rdi), %xmm3
324 ; AVX512BW-NEXT: vmovlhps {{.*#+}} xmm8 = xmm4[0],xmm3[0]
325 ; AVX512BW-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm4[1],xmm3[1]
326 ; AVX512BW-NEXT: vmovaps %xmm5, (%rsi)
327 ; AVX512BW-NEXT: vmovaps %xmm0, (%rdx)
328 ; AVX512BW-NEXT: vmovaps %xmm6, (%rcx)
329 ; AVX512BW-NEXT: vmovaps %xmm1, (%r8)
330 ; AVX512BW-NEXT: vmovaps %xmm7, (%r9)
331 ; AVX512BW-NEXT: vmovaps %xmm2, (%r11)
332 ; AVX512BW-NEXT: vmovaps %xmm8, (%r10)
333 ; AVX512BW-NEXT: vmovaps %xmm3, (%rax)
334 ; AVX512BW-NEXT: retq
336 ; AVX512BW-FCP-LABEL: load_i64_stride8_vf2:
337 ; AVX512BW-FCP: # %bb.0:
338 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
339 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
340 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
341 ; AVX512BW-FCP-NEXT: vmovaps 64(%rdi), %xmm0
342 ; AVX512BW-FCP-NEXT: vmovaps (%rdi), %xmm1
343 ; AVX512BW-FCP-NEXT: vmovaps 16(%rdi), %xmm2
344 ; AVX512BW-FCP-NEXT: vmovaps 32(%rdi), %xmm3
345 ; AVX512BW-FCP-NEXT: vmovaps 48(%rdi), %xmm4
346 ; AVX512BW-FCP-NEXT: vmovlhps {{.*#+}} xmm5 = xmm1[0],xmm0[0]
347 ; AVX512BW-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
348 ; AVX512BW-FCP-NEXT: vmovaps 80(%rdi), %xmm1
349 ; AVX512BW-FCP-NEXT: vmovlhps {{.*#+}} xmm6 = xmm2[0],xmm1[0]
350 ; AVX512BW-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1]
351 ; AVX512BW-FCP-NEXT: vmovaps 96(%rdi), %xmm2
352 ; AVX512BW-FCP-NEXT: vmovlhps {{.*#+}} xmm7 = xmm3[0],xmm2[0]
353 ; AVX512BW-FCP-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1]
354 ; AVX512BW-FCP-NEXT: vmovaps 112(%rdi), %xmm3
355 ; AVX512BW-FCP-NEXT: vmovlhps {{.*#+}} xmm8 = xmm4[0],xmm3[0]
356 ; AVX512BW-FCP-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm4[1],xmm3[1]
357 ; AVX512BW-FCP-NEXT: vmovaps %xmm5, (%rsi)
358 ; AVX512BW-FCP-NEXT: vmovaps %xmm0, (%rdx)
359 ; AVX512BW-FCP-NEXT: vmovaps %xmm6, (%rcx)
360 ; AVX512BW-FCP-NEXT: vmovaps %xmm1, (%r8)
361 ; AVX512BW-FCP-NEXT: vmovaps %xmm7, (%r9)
362 ; AVX512BW-FCP-NEXT: vmovaps %xmm2, (%r11)
363 ; AVX512BW-FCP-NEXT: vmovaps %xmm8, (%r10)
364 ; AVX512BW-FCP-NEXT: vmovaps %xmm3, (%rax)
365 ; AVX512BW-FCP-NEXT: retq
367 ; AVX512DQ-BW-LABEL: load_i64_stride8_vf2:
368 ; AVX512DQ-BW: # %bb.0:
369 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
370 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
371 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
372 ; AVX512DQ-BW-NEXT: vmovaps 64(%rdi), %xmm0
373 ; AVX512DQ-BW-NEXT: vmovaps (%rdi), %xmm1
374 ; AVX512DQ-BW-NEXT: vmovaps 16(%rdi), %xmm2
375 ; AVX512DQ-BW-NEXT: vmovaps 32(%rdi), %xmm3
376 ; AVX512DQ-BW-NEXT: vmovaps 48(%rdi), %xmm4
377 ; AVX512DQ-BW-NEXT: vmovlhps {{.*#+}} xmm5 = xmm1[0],xmm0[0]
378 ; AVX512DQ-BW-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
379 ; AVX512DQ-BW-NEXT: vmovaps 80(%rdi), %xmm1
380 ; AVX512DQ-BW-NEXT: vmovlhps {{.*#+}} xmm6 = xmm2[0],xmm1[0]
381 ; AVX512DQ-BW-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1]
382 ; AVX512DQ-BW-NEXT: vmovaps 96(%rdi), %xmm2
383 ; AVX512DQ-BW-NEXT: vmovlhps {{.*#+}} xmm7 = xmm3[0],xmm2[0]
384 ; AVX512DQ-BW-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1]
385 ; AVX512DQ-BW-NEXT: vmovaps 112(%rdi), %xmm3
386 ; AVX512DQ-BW-NEXT: vmovlhps {{.*#+}} xmm8 = xmm4[0],xmm3[0]
387 ; AVX512DQ-BW-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm4[1],xmm3[1]
388 ; AVX512DQ-BW-NEXT: vmovaps %xmm5, (%rsi)
389 ; AVX512DQ-BW-NEXT: vmovaps %xmm0, (%rdx)
390 ; AVX512DQ-BW-NEXT: vmovaps %xmm6, (%rcx)
391 ; AVX512DQ-BW-NEXT: vmovaps %xmm1, (%r8)
392 ; AVX512DQ-BW-NEXT: vmovaps %xmm7, (%r9)
393 ; AVX512DQ-BW-NEXT: vmovaps %xmm2, (%r11)
394 ; AVX512DQ-BW-NEXT: vmovaps %xmm8, (%r10)
395 ; AVX512DQ-BW-NEXT: vmovaps %xmm3, (%rax)
396 ; AVX512DQ-BW-NEXT: retq
398 ; AVX512DQ-BW-FCP-LABEL: load_i64_stride8_vf2:
399 ; AVX512DQ-BW-FCP: # %bb.0:
400 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
401 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
402 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
403 ; AVX512DQ-BW-FCP-NEXT: vmovaps 64(%rdi), %xmm0
404 ; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %xmm1
405 ; AVX512DQ-BW-FCP-NEXT: vmovaps 16(%rdi), %xmm2
406 ; AVX512DQ-BW-FCP-NEXT: vmovaps 32(%rdi), %xmm3
407 ; AVX512DQ-BW-FCP-NEXT: vmovaps 48(%rdi), %xmm4
408 ; AVX512DQ-BW-FCP-NEXT: vmovlhps {{.*#+}} xmm5 = xmm1[0],xmm0[0]
409 ; AVX512DQ-BW-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
410 ; AVX512DQ-BW-FCP-NEXT: vmovaps 80(%rdi), %xmm1
411 ; AVX512DQ-BW-FCP-NEXT: vmovlhps {{.*#+}} xmm6 = xmm2[0],xmm1[0]
412 ; AVX512DQ-BW-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1]
413 ; AVX512DQ-BW-FCP-NEXT: vmovaps 96(%rdi), %xmm2
414 ; AVX512DQ-BW-FCP-NEXT: vmovlhps {{.*#+}} xmm7 = xmm3[0],xmm2[0]
415 ; AVX512DQ-BW-FCP-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1]
416 ; AVX512DQ-BW-FCP-NEXT: vmovaps 112(%rdi), %xmm3
417 ; AVX512DQ-BW-FCP-NEXT: vmovlhps {{.*#+}} xmm8 = xmm4[0],xmm3[0]
418 ; AVX512DQ-BW-FCP-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm4[1],xmm3[1]
419 ; AVX512DQ-BW-FCP-NEXT: vmovaps %xmm5, (%rsi)
420 ; AVX512DQ-BW-FCP-NEXT: vmovaps %xmm0, (%rdx)
421 ; AVX512DQ-BW-FCP-NEXT: vmovaps %xmm6, (%rcx)
422 ; AVX512DQ-BW-FCP-NEXT: vmovaps %xmm1, (%r8)
423 ; AVX512DQ-BW-FCP-NEXT: vmovaps %xmm7, (%r9)
424 ; AVX512DQ-BW-FCP-NEXT: vmovaps %xmm2, (%r11)
425 ; AVX512DQ-BW-FCP-NEXT: vmovaps %xmm8, (%r10)
426 ; AVX512DQ-BW-FCP-NEXT: vmovaps %xmm3, (%rax)
427 ; AVX512DQ-BW-FCP-NEXT: retq
; Read the whole interleaved group (16 x i64) from %in.vec in one load.
428 %wide.vec = load <16 x i64>, ptr %in.vec, align 64
; Split the group: shuffle mask <N, N+8> selects wide-vector elements N and
; N+8, producing one two-element vector per N in 0..7.
429 %strided.vec0 = shufflevector <16 x i64> %wide.vec, <16 x i64> poison, <2 x i32> <i32 0, i32 8>
430 %strided.vec1 = shufflevector <16 x i64> %wide.vec, <16 x i64> poison, <2 x i32> <i32 1, i32 9>
431 %strided.vec2 = shufflevector <16 x i64> %wide.vec, <16 x i64> poison, <2 x i32> <i32 2, i32 10>
432 %strided.vec3 = shufflevector <16 x i64> %wide.vec, <16 x i64> poison, <2 x i32> <i32 3, i32 11>
433 %strided.vec4 = shufflevector <16 x i64> %wide.vec, <16 x i64> poison, <2 x i32> <i32 4, i32 12>
434 %strided.vec5 = shufflevector <16 x i64> %wide.vec, <16 x i64> poison, <2 x i32> <i32 5, i32 13>
435 %strided.vec6 = shufflevector <16 x i64> %wide.vec, <16 x i64> poison, <2 x i32> <i32 6, i32 14>
436 %strided.vec7 = shufflevector <16 x i64> %wide.vec, <16 x i64> poison, <2 x i32> <i32 7, i32 15>
; Write each two-element result through its own output pointer.
437 store <2 x i64> %strided.vec0, ptr %out.vec0, align 64
438 store <2 x i64> %strided.vec1, ptr %out.vec1, align 64
439 store <2 x i64> %strided.vec2, ptr %out.vec2, align 64
440 store <2 x i64> %strided.vec3, ptr %out.vec3, align 64
441 store <2 x i64> %strided.vec4, ptr %out.vec4, align 64
442 store <2 x i64> %strided.vec5, ptr %out.vec5, align 64
443 store <2 x i64> %strided.vec6, ptr %out.vec6, align 64
444 store <2 x i64> %strided.vec7, ptr %out.vec7, align 64
448 define void @load_i64_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6, ptr %out.vec7) nounwind {
449 ; SSE-LABEL: load_i64_stride8_vf4:
451 ; SSE-NEXT: movaps 112(%rdi), %xmm6
452 ; SSE-NEXT: movaps 240(%rdi), %xmm0
453 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
454 ; SSE-NEXT: movaps 96(%rdi), %xmm9
455 ; SSE-NEXT: movaps 224(%rdi), %xmm11
456 ; SSE-NEXT: movaps 160(%rdi), %xmm0
457 ; SSE-NEXT: movaps 80(%rdi), %xmm14
458 ; SSE-NEXT: movaps 208(%rdi), %xmm15
459 ; SSE-NEXT: movaps 144(%rdi), %xmm2
460 ; SSE-NEXT: movaps 64(%rdi), %xmm12
461 ; SSE-NEXT: movaps (%rdi), %xmm7
462 ; SSE-NEXT: movaps 16(%rdi), %xmm5
463 ; SSE-NEXT: movaps 32(%rdi), %xmm3
464 ; SSE-NEXT: movaps 48(%rdi), %xmm4
465 ; SSE-NEXT: movaps 192(%rdi), %xmm13
466 ; SSE-NEXT: movaps 128(%rdi), %xmm8
467 ; SSE-NEXT: movaps %xmm8, %xmm10
468 ; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm13[0]
469 ; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm13[1]
470 ; SSE-NEXT: movaps %xmm7, %xmm13
471 ; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm12[0]
472 ; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm12[1]
473 ; SSE-NEXT: movaps %xmm2, %xmm12
474 ; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm15[0]
475 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm15[1]
476 ; SSE-NEXT: movaps %xmm5, %xmm15
477 ; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm14[0]
478 ; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm14[1]
479 ; SSE-NEXT: movaps %xmm0, %xmm14
480 ; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm11[0]
481 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm11[1]
482 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
483 ; SSE-NEXT: movaps %xmm3, %xmm11
484 ; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm9[0]
485 ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm9[1]
486 ; SSE-NEXT: movaps %xmm4, %xmm9
487 ; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm6[0]
488 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm6[1]
489 ; SSE-NEXT: movaps 176(%rdi), %xmm6
490 ; SSE-NEXT: movaps %xmm6, %xmm1
491 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
492 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
493 ; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1]
494 ; SSE-NEXT: movaps %xmm13, (%rsi)
495 ; SSE-NEXT: movaps %xmm10, 16(%rsi)
496 ; SSE-NEXT: movaps %xmm7, (%rdx)
497 ; SSE-NEXT: movaps %xmm8, 16(%rdx)
498 ; SSE-NEXT: movaps %xmm15, (%rcx)
499 ; SSE-NEXT: movaps %xmm12, 16(%rcx)
500 ; SSE-NEXT: movaps %xmm5, (%r8)
501 ; SSE-NEXT: movaps %xmm2, 16(%r8)
502 ; SSE-NEXT: movaps %xmm11, (%r9)
503 ; SSE-NEXT: movaps %xmm14, 16(%r9)
504 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
505 ; SSE-NEXT: movaps %xmm3, (%rax)
506 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
507 ; SSE-NEXT: movaps %xmm0, 16(%rax)
508 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
509 ; SSE-NEXT: movaps %xmm9, (%rax)
510 ; SSE-NEXT: movaps %xmm1, 16(%rax)
511 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
512 ; SSE-NEXT: movaps %xmm6, 16(%rax)
513 ; SSE-NEXT: movaps %xmm4, (%rax)
516 ; AVX-LABEL: load_i64_stride8_vf4:
518 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
519 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10
520 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r11
521 ; AVX-NEXT: vmovaps 224(%rdi), %ymm0
522 ; AVX-NEXT: vmovaps 160(%rdi), %ymm1
523 ; AVX-NEXT: vmovaps 192(%rdi), %ymm4
524 ; AVX-NEXT: vmovaps 128(%rdi), %ymm5
525 ; AVX-NEXT: vmovaps 192(%rdi), %xmm3
526 ; AVX-NEXT: vmovaps 128(%rdi), %xmm6
527 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm6[0],xmm3[0]
528 ; AVX-NEXT: vmovaps 64(%rdi), %xmm7
529 ; AVX-NEXT: vmovaps (%rdi), %xmm8
530 ; AVX-NEXT: vmovlhps {{.*#+}} xmm9 = xmm8[0],xmm7[0]
531 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm6[1],xmm3[1]
532 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm8[1],xmm7[1]
533 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm5[0],ymm4[0],ymm5[2],ymm4[2]
534 ; AVX-NEXT: vmovaps 80(%rdi), %xmm8
535 ; AVX-NEXT: vmovaps 16(%rdi), %xmm10
536 ; AVX-NEXT: vmovlhps {{.*#+}} xmm11 = xmm10[0],xmm8[0]
537 ; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1,2,3],ymm7[4,5,6,7]
538 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm5[1],ymm4[1],ymm5[3],ymm4[3]
539 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm10[1],xmm8[1]
540 ; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
541 ; AVX-NEXT: vmovaps 224(%rdi), %xmm5
542 ; AVX-NEXT: vmovaps 160(%rdi), %xmm8
543 ; AVX-NEXT: vmovlhps {{.*#+}} xmm10 = xmm8[0],xmm5[0]
544 ; AVX-NEXT: vmovaps 96(%rdi), %xmm11
545 ; AVX-NEXT: vmovaps 32(%rdi), %xmm12
546 ; AVX-NEXT: vmovlhps {{.*#+}} xmm13 = xmm12[0],xmm11[0]
547 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm8[1],xmm5[1]
548 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm12[1],xmm11[1]
549 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
550 ; AVX-NEXT: vmovaps 112(%rdi), %xmm12
551 ; AVX-NEXT: vmovaps 48(%rdi), %xmm14
552 ; AVX-NEXT: vmovlhps {{.*#+}} xmm15 = xmm14[0],xmm12[0]
553 ; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1,2,3],ymm11[4,5,6,7]
554 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
555 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm14[1],xmm12[1]
556 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
557 ; AVX-NEXT: vmovaps %xmm9, (%rsi)
558 ; AVX-NEXT: vmovaps %xmm2, 16(%rsi)
559 ; AVX-NEXT: vmovaps %xmm6, (%rdx)
560 ; AVX-NEXT: vmovaps %xmm3, 16(%rdx)
561 ; AVX-NEXT: vmovaps %ymm7, (%rcx)
562 ; AVX-NEXT: vmovaps %ymm4, (%r8)
563 ; AVX-NEXT: vmovaps %xmm13, (%r9)
564 ; AVX-NEXT: vmovaps %xmm10, 16(%r9)
565 ; AVX-NEXT: vmovaps %xmm8, (%r11)
566 ; AVX-NEXT: vmovaps %xmm5, 16(%r11)
567 ; AVX-NEXT: vmovaps %ymm11, (%r10)
568 ; AVX-NEXT: vmovaps %ymm0, (%rax)
569 ; AVX-NEXT: vzeroupper
572 ; AVX2-LABEL: load_i64_stride8_vf4:
574 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
575 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
576 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11
577 ; AVX2-NEXT: vmovaps 96(%rdi), %ymm0
578 ; AVX2-NEXT: vmovaps 32(%rdi), %ymm1
579 ; AVX2-NEXT: vmovaps 224(%rdi), %ymm2
580 ; AVX2-NEXT: vmovaps 160(%rdi), %ymm3
581 ; AVX2-NEXT: vmovaps 64(%rdi), %ymm4
582 ; AVX2-NEXT: vmovaps (%rdi), %ymm5
583 ; AVX2-NEXT: vmovaps 192(%rdi), %ymm6
584 ; AVX2-NEXT: vmovaps 128(%rdi), %ymm7
585 ; AVX2-NEXT: vmovaps (%rdi), %xmm8
586 ; AVX2-NEXT: vmovaps 32(%rdi), %xmm9
587 ; AVX2-NEXT: vmovaps 64(%rdi), %xmm10
588 ; AVX2-NEXT: vinsertf128 $1, 192(%rdi), %ymm10, %ymm10
589 ; AVX2-NEXT: vinsertf128 $1, 128(%rdi), %ymm8, %ymm8
590 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm8[0],ymm10[0],ymm8[2],ymm10[2]
591 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm8[1],ymm10[1],ymm8[3],ymm10[3]
592 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm7[0],ymm6[0],ymm7[2],ymm6[2]
593 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm5[0],ymm4[0],ymm5[2],ymm4[2]
594 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm12[2,3],ymm10[2,3]
595 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm7[1],ymm6[1],ymm7[3],ymm6[3]
596 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm5[1],ymm4[1],ymm5[3],ymm4[3]
597 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm6[2,3]
598 ; AVX2-NEXT: vmovaps 96(%rdi), %xmm5
599 ; AVX2-NEXT: vinsertf128 $1, 224(%rdi), %ymm5, %ymm5
600 ; AVX2-NEXT: vinsertf128 $1, 160(%rdi), %ymm9, %ymm6
601 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm6[0],ymm5[0],ymm6[2],ymm5[2]
602 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3]
603 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
604 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
605 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm9[2,3],ymm6[2,3]
606 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm3[1],ymm2[1],ymm3[3],ymm2[3]
607 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
608 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
609 ; AVX2-NEXT: vmovaps %ymm11, (%rsi)
610 ; AVX2-NEXT: vmovaps %ymm8, (%rdx)
611 ; AVX2-NEXT: vmovaps %ymm10, (%rcx)
612 ; AVX2-NEXT: vmovaps %ymm4, (%r8)
613 ; AVX2-NEXT: vmovaps %ymm7, (%r9)
614 ; AVX2-NEXT: vmovaps %ymm5, (%r11)
615 ; AVX2-NEXT: vmovaps %ymm6, (%r10)
616 ; AVX2-NEXT: vmovaps %ymm0, (%rax)
617 ; AVX2-NEXT: vzeroupper
620 ; AVX2-FP-LABEL: load_i64_stride8_vf4:
622 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
623 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10
624 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r11
625 ; AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm0
626 ; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm1
627 ; AVX2-FP-NEXT: vmovaps 224(%rdi), %ymm2
628 ; AVX2-FP-NEXT: vmovaps 160(%rdi), %ymm3
629 ; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm4
630 ; AVX2-FP-NEXT: vmovaps (%rdi), %ymm5
631 ; AVX2-FP-NEXT: vmovaps 192(%rdi), %ymm6
632 ; AVX2-FP-NEXT: vmovaps 128(%rdi), %ymm7
633 ; AVX2-FP-NEXT: vmovaps (%rdi), %xmm8
634 ; AVX2-FP-NEXT: vmovaps 32(%rdi), %xmm9
635 ; AVX2-FP-NEXT: vmovaps 64(%rdi), %xmm10
636 ; AVX2-FP-NEXT: vinsertf128 $1, 192(%rdi), %ymm10, %ymm10
637 ; AVX2-FP-NEXT: vinsertf128 $1, 128(%rdi), %ymm8, %ymm8
638 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm8[0],ymm10[0],ymm8[2],ymm10[2]
639 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm8[1],ymm10[1],ymm8[3],ymm10[3]
640 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm7[0],ymm6[0],ymm7[2],ymm6[2]
641 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm5[0],ymm4[0],ymm5[2],ymm4[2]
642 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm12[2,3],ymm10[2,3]
643 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm7[1],ymm6[1],ymm7[3],ymm6[3]
644 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm5[1],ymm4[1],ymm5[3],ymm4[3]
645 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm6[2,3]
646 ; AVX2-FP-NEXT: vmovaps 96(%rdi), %xmm5
647 ; AVX2-FP-NEXT: vinsertf128 $1, 224(%rdi), %ymm5, %ymm5
648 ; AVX2-FP-NEXT: vinsertf128 $1, 160(%rdi), %ymm9, %ymm6
649 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm6[0],ymm5[0],ymm6[2],ymm5[2]
650 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3]
651 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
652 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
653 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm9[2,3],ymm6[2,3]
654 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm3[1],ymm2[1],ymm3[3],ymm2[3]
655 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
656 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
657 ; AVX2-FP-NEXT: vmovaps %ymm11, (%rsi)
658 ; AVX2-FP-NEXT: vmovaps %ymm8, (%rdx)
659 ; AVX2-FP-NEXT: vmovaps %ymm10, (%rcx)
660 ; AVX2-FP-NEXT: vmovaps %ymm4, (%r8)
661 ; AVX2-FP-NEXT: vmovaps %ymm7, (%r9)
662 ; AVX2-FP-NEXT: vmovaps %ymm5, (%r11)
663 ; AVX2-FP-NEXT: vmovaps %ymm6, (%r10)
664 ; AVX2-FP-NEXT: vmovaps %ymm0, (%rax)
665 ; AVX2-FP-NEXT: vzeroupper
668 ; AVX2-FCP-LABEL: load_i64_stride8_vf4:
670 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
671 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
672 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
673 ; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm0
674 ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm1
675 ; AVX2-FCP-NEXT: vmovaps 224(%rdi), %ymm2
676 ; AVX2-FCP-NEXT: vmovaps 160(%rdi), %ymm3
677 ; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm4
678 ; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm5
679 ; AVX2-FCP-NEXT: vmovaps 192(%rdi), %ymm6
680 ; AVX2-FCP-NEXT: vmovaps 128(%rdi), %ymm7
681 ; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm8
682 ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %xmm9
683 ; AVX2-FCP-NEXT: vmovaps 64(%rdi), %xmm10
684 ; AVX2-FCP-NEXT: vinsertf128 $1, 192(%rdi), %ymm10, %ymm10
685 ; AVX2-FCP-NEXT: vinsertf128 $1, 128(%rdi), %ymm8, %ymm8
686 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm8[0],ymm10[0],ymm8[2],ymm10[2]
687 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm8[1],ymm10[1],ymm8[3],ymm10[3]
688 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm7[0],ymm6[0],ymm7[2],ymm6[2]
689 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm5[0],ymm4[0],ymm5[2],ymm4[2]
690 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm12[2,3],ymm10[2,3]
691 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm7[1],ymm6[1],ymm7[3],ymm6[3]
692 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm5[1],ymm4[1],ymm5[3],ymm4[3]
693 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm6[2,3]
694 ; AVX2-FCP-NEXT: vmovaps 96(%rdi), %xmm5
695 ; AVX2-FCP-NEXT: vinsertf128 $1, 224(%rdi), %ymm5, %ymm5
696 ; AVX2-FCP-NEXT: vinsertf128 $1, 160(%rdi), %ymm9, %ymm6
697 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm6[0],ymm5[0],ymm6[2],ymm5[2]
698 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3]
699 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
700 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
701 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm9[2,3],ymm6[2,3]
702 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm3[1],ymm2[1],ymm3[3],ymm2[3]
703 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
704 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
705 ; AVX2-FCP-NEXT: vmovaps %ymm11, (%rsi)
706 ; AVX2-FCP-NEXT: vmovaps %ymm8, (%rdx)
707 ; AVX2-FCP-NEXT: vmovaps %ymm10, (%rcx)
708 ; AVX2-FCP-NEXT: vmovaps %ymm4, (%r8)
709 ; AVX2-FCP-NEXT: vmovaps %ymm7, (%r9)
710 ; AVX2-FCP-NEXT: vmovaps %ymm5, (%r11)
711 ; AVX2-FCP-NEXT: vmovaps %ymm6, (%r10)
712 ; AVX2-FCP-NEXT: vmovaps %ymm0, (%rax)
713 ; AVX2-FCP-NEXT: vzeroupper
714 ; AVX2-FCP-NEXT: retq
716 ; AVX512-LABEL: load_i64_stride8_vf4:
718 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
719 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
720 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11
721 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm3
722 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm4
723 ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm5
724 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm6
725 ; AVX512-NEXT: vmovaps (%rdi), %xmm0
726 ; AVX512-NEXT: vmovaps 64(%rdi), %xmm1
727 ; AVX512-NEXT: vinsertf128 $1, 192(%rdi), %ymm1, %ymm1
728 ; AVX512-NEXT: vinsertf128 $1, 128(%rdi), %ymm0, %ymm0
729 ; AVX512-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
730 ; AVX512-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
731 ; AVX512-NEXT: vmovaps 192(%rdi), %ymm1
732 ; AVX512-NEXT: vmovaps 128(%rdi), %ymm2
733 ; AVX512-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
734 ; AVX512-NEXT: vmovaps 64(%rdi), %ymm9
735 ; AVX512-NEXT: vmovaps (%rdi), %ymm10
736 ; AVX512-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm10[0],ymm9[0],ymm10[2],ymm9[2]
737 ; AVX512-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3]
738 ; AVX512-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3]
739 ; AVX512-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm10[1],ymm9[1],ymm10[3],ymm9[3]
740 ; AVX512-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
741 ; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,4,12]
742 ; AVX512-NEXT: vpermi2q %zmm5, %zmm6, %zmm2
743 ; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,12]
744 ; AVX512-NEXT: vpermi2q %zmm3, %zmm4, %zmm9
745 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm2[4,5,6,7]
746 ; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,5,13]
747 ; AVX512-NEXT: vpermi2q %zmm5, %zmm6, %zmm9
748 ; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm10 = [5,13]
749 ; AVX512-NEXT: vpermi2q %zmm3, %zmm4, %zmm10
750 ; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
751 ; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm10 = [0,0,6,14]
752 ; AVX512-NEXT: vpermi2q %zmm5, %zmm6, %zmm10
753 ; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm11 = [6,14]
754 ; AVX512-NEXT: vpermi2q %zmm3, %zmm4, %zmm11
755 ; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
756 ; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm11 = [0,0,7,15]
757 ; AVX512-NEXT: vpermi2q %zmm5, %zmm6, %zmm11
758 ; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm5 = [7,15]
759 ; AVX512-NEXT: vpermi2q %zmm3, %zmm4, %zmm5
760 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm11[4,5,6,7]
761 ; AVX512-NEXT: vmovaps %ymm7, (%rsi)
762 ; AVX512-NEXT: vmovaps %ymm8, (%rdx)
763 ; AVX512-NEXT: vmovaps %ymm0, (%rcx)
764 ; AVX512-NEXT: vmovaps %ymm1, (%r8)
765 ; AVX512-NEXT: vmovdqa %ymm2, (%r9)
766 ; AVX512-NEXT: vmovdqa %ymm9, (%r11)
767 ; AVX512-NEXT: vmovdqa %ymm10, (%r10)
768 ; AVX512-NEXT: vmovdqa %ymm3, (%rax)
769 ; AVX512-NEXT: vzeroupper
772 ; AVX512-FCP-LABEL: load_i64_stride8_vf4:
773 ; AVX512-FCP: # %bb.0:
774 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
775 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
776 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
777 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3
778 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm4
779 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm5
780 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm6
781 ; AVX512-FCP-NEXT: vmovaps (%rdi), %xmm0
782 ; AVX512-FCP-NEXT: vmovaps 64(%rdi), %xmm1
783 ; AVX512-FCP-NEXT: vinsertf128 $1, 192(%rdi), %ymm1, %ymm1
784 ; AVX512-FCP-NEXT: vinsertf128 $1, 128(%rdi), %ymm0, %ymm0
785 ; AVX512-FCP-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
786 ; AVX512-FCP-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
787 ; AVX512-FCP-NEXT: vmovaps 192(%rdi), %ymm1
788 ; AVX512-FCP-NEXT: vmovaps 128(%rdi), %ymm2
789 ; AVX512-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
790 ; AVX512-FCP-NEXT: vmovaps 64(%rdi), %ymm9
791 ; AVX512-FCP-NEXT: vmovaps (%rdi), %ymm10
792 ; AVX512-FCP-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm10[0],ymm9[0],ymm10[2],ymm9[2]
793 ; AVX512-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3]
794 ; AVX512-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3]
795 ; AVX512-FCP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm10[1],ymm9[1],ymm10[3],ymm9[3]
796 ; AVX512-FCP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
797 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,4,12]
798 ; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm6, %zmm2
799 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,12]
800 ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm9
801 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm2[4,5,6,7]
802 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,5,13]
803 ; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm6, %zmm9
804 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm10 = [5,13]
805 ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm10
806 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
807 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm10 = [0,0,6,14]
808 ; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm6, %zmm10
809 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm11 = [6,14]
810 ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm11
811 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
812 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm11 = [0,0,7,15]
813 ; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm6, %zmm11
814 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm5 = [7,15]
815 ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm5
816 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm11[4,5,6,7]
817 ; AVX512-FCP-NEXT: vmovaps %ymm7, (%rsi)
818 ; AVX512-FCP-NEXT: vmovaps %ymm8, (%rdx)
819 ; AVX512-FCP-NEXT: vmovaps %ymm0, (%rcx)
820 ; AVX512-FCP-NEXT: vmovaps %ymm1, (%r8)
821 ; AVX512-FCP-NEXT: vmovdqa %ymm2, (%r9)
822 ; AVX512-FCP-NEXT: vmovdqa %ymm9, (%r11)
823 ; AVX512-FCP-NEXT: vmovdqa %ymm10, (%r10)
824 ; AVX512-FCP-NEXT: vmovdqa %ymm3, (%rax)
825 ; AVX512-FCP-NEXT: vzeroupper
826 ; AVX512-FCP-NEXT: retq
828 ; AVX512DQ-LABEL: load_i64_stride8_vf4:
830 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
831 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10
832 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r11
833 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm3
834 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm4
835 ; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm5
836 ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm6
837 ; AVX512DQ-NEXT: vmovaps (%rdi), %xmm0
838 ; AVX512DQ-NEXT: vmovaps 64(%rdi), %xmm1
839 ; AVX512DQ-NEXT: vinsertf128 $1, 192(%rdi), %ymm1, %ymm1
840 ; AVX512DQ-NEXT: vinsertf128 $1, 128(%rdi), %ymm0, %ymm0
841 ; AVX512DQ-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
842 ; AVX512DQ-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
843 ; AVX512DQ-NEXT: vmovaps 192(%rdi), %ymm1
844 ; AVX512DQ-NEXT: vmovaps 128(%rdi), %ymm2
845 ; AVX512DQ-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
846 ; AVX512DQ-NEXT: vmovaps 64(%rdi), %ymm9
847 ; AVX512DQ-NEXT: vmovaps (%rdi), %ymm10
848 ; AVX512DQ-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm10[0],ymm9[0],ymm10[2],ymm9[2]
849 ; AVX512DQ-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3]
850 ; AVX512DQ-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3]
851 ; AVX512DQ-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm10[1],ymm9[1],ymm10[3],ymm9[3]
852 ; AVX512DQ-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
853 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,4,12]
854 ; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm6, %zmm2
855 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,12]
856 ; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm4, %zmm9
857 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm2[4,5,6,7]
858 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,5,13]
859 ; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm6, %zmm9
860 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm10 = [5,13]
861 ; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm4, %zmm10
862 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
863 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm10 = [0,0,6,14]
864 ; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm6, %zmm10
865 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm11 = [6,14]
866 ; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm4, %zmm11
867 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
868 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm11 = [0,0,7,15]
869 ; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm6, %zmm11
870 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm5 = [7,15]
871 ; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm4, %zmm5
872 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm11[4,5,6,7]
873 ; AVX512DQ-NEXT: vmovaps %ymm7, (%rsi)
874 ; AVX512DQ-NEXT: vmovaps %ymm8, (%rdx)
875 ; AVX512DQ-NEXT: vmovaps %ymm0, (%rcx)
876 ; AVX512DQ-NEXT: vmovaps %ymm1, (%r8)
877 ; AVX512DQ-NEXT: vmovdqa %ymm2, (%r9)
878 ; AVX512DQ-NEXT: vmovdqa %ymm9, (%r11)
879 ; AVX512DQ-NEXT: vmovdqa %ymm10, (%r10)
880 ; AVX512DQ-NEXT: vmovdqa %ymm3, (%rax)
881 ; AVX512DQ-NEXT: vzeroupper
882 ; AVX512DQ-NEXT: retq
884 ; AVX512DQ-FCP-LABEL: load_i64_stride8_vf4:
885 ; AVX512DQ-FCP: # %bb.0:
886 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
887 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
888 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
889 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3
890 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm4
891 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm5
892 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm6
893 ; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %xmm0
894 ; AVX512DQ-FCP-NEXT: vmovaps 64(%rdi), %xmm1
895 ; AVX512DQ-FCP-NEXT: vinsertf128 $1, 192(%rdi), %ymm1, %ymm1
896 ; AVX512DQ-FCP-NEXT: vinsertf128 $1, 128(%rdi), %ymm0, %ymm0
897 ; AVX512DQ-FCP-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
898 ; AVX512DQ-FCP-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
899 ; AVX512DQ-FCP-NEXT: vmovaps 192(%rdi), %ymm1
900 ; AVX512DQ-FCP-NEXT: vmovaps 128(%rdi), %ymm2
901 ; AVX512DQ-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
902 ; AVX512DQ-FCP-NEXT: vmovaps 64(%rdi), %ymm9
903 ; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %ymm10
904 ; AVX512DQ-FCP-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm10[0],ymm9[0],ymm10[2],ymm9[2]
905 ; AVX512DQ-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3]
906 ; AVX512DQ-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3]
907 ; AVX512DQ-FCP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm10[1],ymm9[1],ymm10[3],ymm9[3]
908 ; AVX512DQ-FCP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
909 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,4,12]
910 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm6, %zmm2
911 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,12]
912 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm9
913 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm2[4,5,6,7]
914 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,5,13]
915 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm6, %zmm9
916 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm10 = [5,13]
917 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm10
918 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
919 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm10 = [0,0,6,14]
920 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm6, %zmm10
921 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm11 = [6,14]
922 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm11
923 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
924 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm11 = [0,0,7,15]
925 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm6, %zmm11
926 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm5 = [7,15]
927 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm5
928 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm11[4,5,6,7]
929 ; AVX512DQ-FCP-NEXT: vmovaps %ymm7, (%rsi)
930 ; AVX512DQ-FCP-NEXT: vmovaps %ymm8, (%rdx)
931 ; AVX512DQ-FCP-NEXT: vmovaps %ymm0, (%rcx)
932 ; AVX512DQ-FCP-NEXT: vmovaps %ymm1, (%r8)
933 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, (%r9)
934 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, (%r11)
935 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, (%r10)
936 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm3, (%rax)
937 ; AVX512DQ-FCP-NEXT: vzeroupper
938 ; AVX512DQ-FCP-NEXT: retq
940 ; AVX512BW-LABEL: load_i64_stride8_vf4:
942 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
943 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
944 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
945 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm3
946 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm4
947 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm5
948 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm6
949 ; AVX512BW-NEXT: vmovaps (%rdi), %xmm0
950 ; AVX512BW-NEXT: vmovaps 64(%rdi), %xmm1
951 ; AVX512BW-NEXT: vinsertf128 $1, 192(%rdi), %ymm1, %ymm1
952 ; AVX512BW-NEXT: vinsertf128 $1, 128(%rdi), %ymm0, %ymm0
953 ; AVX512BW-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
954 ; AVX512BW-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
955 ; AVX512BW-NEXT: vmovaps 192(%rdi), %ymm1
956 ; AVX512BW-NEXT: vmovaps 128(%rdi), %ymm2
957 ; AVX512BW-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
958 ; AVX512BW-NEXT: vmovaps 64(%rdi), %ymm9
959 ; AVX512BW-NEXT: vmovaps (%rdi), %ymm10
960 ; AVX512BW-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm10[0],ymm9[0],ymm10[2],ymm9[2]
961 ; AVX512BW-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3]
962 ; AVX512BW-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3]
963 ; AVX512BW-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm10[1],ymm9[1],ymm10[3],ymm9[3]
964 ; AVX512BW-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
965 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,4,12]
966 ; AVX512BW-NEXT: vpermi2q %zmm5, %zmm6, %zmm2
967 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,12]
968 ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm4, %zmm9
969 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm2[4,5,6,7]
970 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,5,13]
971 ; AVX512BW-NEXT: vpermi2q %zmm5, %zmm6, %zmm9
972 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm10 = [5,13]
973 ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm4, %zmm10
974 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
975 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm10 = [0,0,6,14]
976 ; AVX512BW-NEXT: vpermi2q %zmm5, %zmm6, %zmm10
977 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm11 = [6,14]
978 ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm4, %zmm11
979 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
980 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm11 = [0,0,7,15]
981 ; AVX512BW-NEXT: vpermi2q %zmm5, %zmm6, %zmm11
982 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm5 = [7,15]
983 ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm4, %zmm5
984 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm11[4,5,6,7]
985 ; AVX512BW-NEXT: vmovaps %ymm7, (%rsi)
986 ; AVX512BW-NEXT: vmovaps %ymm8, (%rdx)
987 ; AVX512BW-NEXT: vmovaps %ymm0, (%rcx)
988 ; AVX512BW-NEXT: vmovaps %ymm1, (%r8)
989 ; AVX512BW-NEXT: vmovdqa %ymm2, (%r9)
990 ; AVX512BW-NEXT: vmovdqa %ymm9, (%r11)
991 ; AVX512BW-NEXT: vmovdqa %ymm10, (%r10)
992 ; AVX512BW-NEXT: vmovdqa %ymm3, (%rax)
993 ; AVX512BW-NEXT: vzeroupper
994 ; AVX512BW-NEXT: retq
996 ; AVX512BW-FCP-LABEL: load_i64_stride8_vf4:
997 ; AVX512BW-FCP: # %bb.0:
998 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
999 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
1000 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
1001 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3
1002 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm4
1003 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm5
1004 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm6
1005 ; AVX512BW-FCP-NEXT: vmovaps (%rdi), %xmm0
1006 ; AVX512BW-FCP-NEXT: vmovaps 64(%rdi), %xmm1
1007 ; AVX512BW-FCP-NEXT: vinsertf128 $1, 192(%rdi), %ymm1, %ymm1
1008 ; AVX512BW-FCP-NEXT: vinsertf128 $1, 128(%rdi), %ymm0, %ymm0
1009 ; AVX512BW-FCP-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
1010 ; AVX512BW-FCP-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
1011 ; AVX512BW-FCP-NEXT: vmovaps 192(%rdi), %ymm1
1012 ; AVX512BW-FCP-NEXT: vmovaps 128(%rdi), %ymm2
1013 ; AVX512BW-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
1014 ; AVX512BW-FCP-NEXT: vmovaps 64(%rdi), %ymm9
1015 ; AVX512BW-FCP-NEXT: vmovaps (%rdi), %ymm10
1016 ; AVX512BW-FCP-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm10[0],ymm9[0],ymm10[2],ymm9[2]
1017 ; AVX512BW-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3]
1018 ; AVX512BW-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3]
1019 ; AVX512BW-FCP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm10[1],ymm9[1],ymm10[3],ymm9[3]
1020 ; AVX512BW-FCP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
1021 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,4,12]
1022 ; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm6, %zmm2
1023 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,12]
1024 ; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm9
1025 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm2[4,5,6,7]
1026 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,5,13]
1027 ; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm6, %zmm9
1028 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm10 = [5,13]
1029 ; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm10
1030 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
1031 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm10 = [0,0,6,14]
1032 ; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm6, %zmm10
1033 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm11 = [6,14]
1034 ; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm11
1035 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
1036 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm11 = [0,0,7,15]
1037 ; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm6, %zmm11
1038 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm5 = [7,15]
1039 ; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm5
1040 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm11[4,5,6,7]
1041 ; AVX512BW-FCP-NEXT: vmovaps %ymm7, (%rsi)
1042 ; AVX512BW-FCP-NEXT: vmovaps %ymm8, (%rdx)
1043 ; AVX512BW-FCP-NEXT: vmovaps %ymm0, (%rcx)
1044 ; AVX512BW-FCP-NEXT: vmovaps %ymm1, (%r8)
1045 ; AVX512BW-FCP-NEXT: vmovdqa %ymm2, (%r9)
1046 ; AVX512BW-FCP-NEXT: vmovdqa %ymm9, (%r11)
1047 ; AVX512BW-FCP-NEXT: vmovdqa %ymm10, (%r10)
1048 ; AVX512BW-FCP-NEXT: vmovdqa %ymm3, (%rax)
1049 ; AVX512BW-FCP-NEXT: vzeroupper
1050 ; AVX512BW-FCP-NEXT: retq
1052 ; AVX512DQ-BW-LABEL: load_i64_stride8_vf4:
1053 ; AVX512DQ-BW: # %bb.0:
1054 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
1055 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
1056 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
1057 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm3
1058 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm4
1059 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm5
1060 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm6
1061 ; AVX512DQ-BW-NEXT: vmovaps (%rdi), %xmm0
1062 ; AVX512DQ-BW-NEXT: vmovaps 64(%rdi), %xmm1
1063 ; AVX512DQ-BW-NEXT: vinsertf128 $1, 192(%rdi), %ymm1, %ymm1
1064 ; AVX512DQ-BW-NEXT: vinsertf128 $1, 128(%rdi), %ymm0, %ymm0
1065 ; AVX512DQ-BW-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
1066 ; AVX512DQ-BW-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
1067 ; AVX512DQ-BW-NEXT: vmovaps 192(%rdi), %ymm1
1068 ; AVX512DQ-BW-NEXT: vmovaps 128(%rdi), %ymm2
1069 ; AVX512DQ-BW-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
1070 ; AVX512DQ-BW-NEXT: vmovaps 64(%rdi), %ymm9
1071 ; AVX512DQ-BW-NEXT: vmovaps (%rdi), %ymm10
1072 ; AVX512DQ-BW-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm10[0],ymm9[0],ymm10[2],ymm9[2]
1073 ; AVX512DQ-BW-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3]
1074 ; AVX512DQ-BW-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3]
1075 ; AVX512DQ-BW-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm10[1],ymm9[1],ymm10[3],ymm9[3]
1076 ; AVX512DQ-BW-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
1077 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,4,12]
1078 ; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm6, %zmm2
1079 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,12]
1080 ; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm4, %zmm9
1081 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm2[4,5,6,7]
1082 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,5,13]
1083 ; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm6, %zmm9
1084 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm10 = [5,13]
1085 ; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm4, %zmm10
1086 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
1087 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm10 = [0,0,6,14]
1088 ; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm6, %zmm10
1089 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm11 = [6,14]
1090 ; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm4, %zmm11
1091 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
1092 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm11 = [0,0,7,15]
1093 ; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm6, %zmm11
1094 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm5 = [7,15]
1095 ; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm4, %zmm5
1096 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm11[4,5,6,7]
1097 ; AVX512DQ-BW-NEXT: vmovaps %ymm7, (%rsi)
1098 ; AVX512DQ-BW-NEXT: vmovaps %ymm8, (%rdx)
1099 ; AVX512DQ-BW-NEXT: vmovaps %ymm0, (%rcx)
1100 ; AVX512DQ-BW-NEXT: vmovaps %ymm1, (%r8)
1101 ; AVX512DQ-BW-NEXT: vmovdqa %ymm2, (%r9)
1102 ; AVX512DQ-BW-NEXT: vmovdqa %ymm9, (%r11)
1103 ; AVX512DQ-BW-NEXT: vmovdqa %ymm10, (%r10)
1104 ; AVX512DQ-BW-NEXT: vmovdqa %ymm3, (%rax)
1105 ; AVX512DQ-BW-NEXT: vzeroupper
1106 ; AVX512DQ-BW-NEXT: retq
1108 ; AVX512DQ-BW-FCP-LABEL: load_i64_stride8_vf4:
1109 ; AVX512DQ-BW-FCP: # %bb.0:
1110 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
1111 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
1112 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
1113 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3
1114 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm4
1115 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm5
1116 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm6
1117 ; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %xmm0
1118 ; AVX512DQ-BW-FCP-NEXT: vmovaps 64(%rdi), %xmm1
1119 ; AVX512DQ-BW-FCP-NEXT: vinsertf128 $1, 192(%rdi), %ymm1, %ymm1
1120 ; AVX512DQ-BW-FCP-NEXT: vinsertf128 $1, 128(%rdi), %ymm0, %ymm0
1121 ; AVX512DQ-BW-FCP-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
1122 ; AVX512DQ-BW-FCP-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
1123 ; AVX512DQ-BW-FCP-NEXT: vmovaps 192(%rdi), %ymm1
1124 ; AVX512DQ-BW-FCP-NEXT: vmovaps 128(%rdi), %ymm2
1125 ; AVX512DQ-BW-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
1126 ; AVX512DQ-BW-FCP-NEXT: vmovaps 64(%rdi), %ymm9
1127 ; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %ymm10
1128 ; AVX512DQ-BW-FCP-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm10[0],ymm9[0],ymm10[2],ymm9[2]
1129 ; AVX512DQ-BW-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3]
1130 ; AVX512DQ-BW-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3]
1131 ; AVX512DQ-BW-FCP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm10[1],ymm9[1],ymm10[3],ymm9[3]
1132 ; AVX512DQ-BW-FCP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
1133 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,4,12]
1134 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm6, %zmm2
1135 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,12]
1136 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm9
1137 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm2[4,5,6,7]
1138 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,5,13]
1139 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm6, %zmm9
1140 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm10 = [5,13]
1141 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm10
1142 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
1143 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm10 = [0,0,6,14]
1144 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm6, %zmm10
1145 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm11 = [6,14]
1146 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm11
1147 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
1148 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm11 = [0,0,7,15]
1149 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm6, %zmm11
1150 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm5 = [7,15]
1151 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm5
1152 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm11[4,5,6,7]
1153 ; AVX512DQ-BW-FCP-NEXT: vmovaps %ymm7, (%rsi)
1154 ; AVX512DQ-BW-FCP-NEXT: vmovaps %ymm8, (%rdx)
1155 ; AVX512DQ-BW-FCP-NEXT: vmovaps %ymm0, (%rcx)
1156 ; AVX512DQ-BW-FCP-NEXT: vmovaps %ymm1, (%r8)
1157 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm2, (%r9)
1158 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm9, (%r11)
1159 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm10, (%r10)
1160 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm3, (%rax)
1161 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
1162 ; AVX512DQ-BW-FCP-NEXT: retq
1163 %wide.vec = load <32 x i64>, ptr %in.vec, align 64
1164 %strided.vec0 = shufflevector <32 x i64> %wide.vec, <32 x i64> poison, <4 x i32> <i32 0, i32 8, i32 16, i32 24>
1165 %strided.vec1 = shufflevector <32 x i64> %wide.vec, <32 x i64> poison, <4 x i32> <i32 1, i32 9, i32 17, i32 25>
1166 %strided.vec2 = shufflevector <32 x i64> %wide.vec, <32 x i64> poison, <4 x i32> <i32 2, i32 10, i32 18, i32 26>
1167 %strided.vec3 = shufflevector <32 x i64> %wide.vec, <32 x i64> poison, <4 x i32> <i32 3, i32 11, i32 19, i32 27>
1168 %strided.vec4 = shufflevector <32 x i64> %wide.vec, <32 x i64> poison, <4 x i32> <i32 4, i32 12, i32 20, i32 28>
1169 %strided.vec5 = shufflevector <32 x i64> %wide.vec, <32 x i64> poison, <4 x i32> <i32 5, i32 13, i32 21, i32 29>
1170 %strided.vec6 = shufflevector <32 x i64> %wide.vec, <32 x i64> poison, <4 x i32> <i32 6, i32 14, i32 22, i32 30>
1171 %strided.vec7 = shufflevector <32 x i64> %wide.vec, <32 x i64> poison, <4 x i32> <i32 7, i32 15, i32 23, i32 31>
1172 store <4 x i64> %strided.vec0, ptr %out.vec0, align 64
1173 store <4 x i64> %strided.vec1, ptr %out.vec1, align 64
1174 store <4 x i64> %strided.vec2, ptr %out.vec2, align 64
1175 store <4 x i64> %strided.vec3, ptr %out.vec3, align 64
1176 store <4 x i64> %strided.vec4, ptr %out.vec4, align 64
1177 store <4 x i64> %strided.vec5, ptr %out.vec5, align 64
1178 store <4 x i64> %strided.vec6, ptr %out.vec6, align 64
1179 store <4 x i64> %strided.vec7, ptr %out.vec7, align 64
1183 define void @load_i64_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6, ptr %out.vec7) nounwind {
1184 ; SSE-LABEL: load_i64_stride8_vf8:
1186 ; SSE-NEXT: subq $152, %rsp
1187 ; SSE-NEXT: movaps 336(%rdi), %xmm0
1188 ; SSE-NEXT: movaps 464(%rdi), %xmm1
1189 ; SSE-NEXT: movaps 400(%rdi), %xmm8
1190 ; SSE-NEXT: movaps 80(%rdi), %xmm2
1191 ; SSE-NEXT: movaps 208(%rdi), %xmm3
1192 ; SSE-NEXT: movaps 144(%rdi), %xmm9
1193 ; SSE-NEXT: movaps 320(%rdi), %xmm4
1194 ; SSE-NEXT: movaps 256(%rdi), %xmm11
1195 ; SSE-NEXT: movaps 448(%rdi), %xmm5
1196 ; SSE-NEXT: movaps 384(%rdi), %xmm12
1197 ; SSE-NEXT: movaps 64(%rdi), %xmm6
1198 ; SSE-NEXT: movaps (%rdi), %xmm13
1199 ; SSE-NEXT: movaps 16(%rdi), %xmm10
1200 ; SSE-NEXT: movaps 192(%rdi), %xmm7
1201 ; SSE-NEXT: movaps 128(%rdi), %xmm14
1202 ; SSE-NEXT: movaps %xmm14, %xmm15
1203 ; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm7[0]
1204 ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1205 ; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm7[1]
1206 ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1207 ; SSE-NEXT: movaps %xmm13, %xmm7
1208 ; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm6[0]
1209 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1210 ; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm6[1]
1211 ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1212 ; SSE-NEXT: movaps %xmm12, %xmm6
1213 ; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm5[0]
1214 ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1215 ; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm5[1]
1216 ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1217 ; SSE-NEXT: movaps %xmm11, %xmm5
1218 ; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm4[0]
1219 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1220 ; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm4[1]
1221 ; SSE-NEXT: movaps %xmm11, (%rsp) # 16-byte Spill
1222 ; SSE-NEXT: movaps %xmm9, %xmm4
1223 ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm3[0]
1224 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1225 ; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm3[1]
1226 ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1227 ; SSE-NEXT: movaps %xmm10, %xmm3
1228 ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0]
1229 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1230 ; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm2[1]
1231 ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1232 ; SSE-NEXT: movaps %xmm8, %xmm2
1233 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
1234 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1235 ; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm1[1]
1236 ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1237 ; SSE-NEXT: movaps 272(%rdi), %xmm15
1238 ; SSE-NEXT: movaps %xmm15, %xmm1
1239 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
1240 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1241 ; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1]
1242 ; SSE-NEXT: movaps 96(%rdi), %xmm0
1243 ; SSE-NEXT: movaps 32(%rdi), %xmm14
1244 ; SSE-NEXT: movaps %xmm14, %xmm1
1245 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
1246 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1247 ; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1]
1248 ; SSE-NEXT: movaps 224(%rdi), %xmm0
1249 ; SSE-NEXT: movaps 160(%rdi), %xmm9
1250 ; SSE-NEXT: movaps %xmm9, %xmm1
1251 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
1252 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1253 ; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1]
1254 ; SSE-NEXT: movaps 352(%rdi), %xmm0
1255 ; SSE-NEXT: movaps 288(%rdi), %xmm10
1256 ; SSE-NEXT: movaps %xmm10, %xmm12
1257 ; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm0[0]
1258 ; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1]
1259 ; SSE-NEXT: movaps 480(%rdi), %xmm0
1260 ; SSE-NEXT: movaps 416(%rdi), %xmm8
1261 ; SSE-NEXT: movaps %xmm8, %xmm11
1262 ; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0]
1263 ; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1]
1264 ; SSE-NEXT: movaps 112(%rdi), %xmm0
1265 ; SSE-NEXT: movaps 48(%rdi), %xmm7
1266 ; SSE-NEXT: movaps %xmm7, %xmm13
1267 ; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm0[0]
1268 ; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1]
1269 ; SSE-NEXT: movaps 240(%rdi), %xmm0
1270 ; SSE-NEXT: movaps 176(%rdi), %xmm5
1271 ; SSE-NEXT: movaps %xmm5, %xmm6
1272 ; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0]
1273 ; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1]
1274 ; SSE-NEXT: movaps 368(%rdi), %xmm0
1275 ; SSE-NEXT: movaps 304(%rdi), %xmm2
1276 ; SSE-NEXT: movaps %xmm2, %xmm4
1277 ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0]
1278 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
1279 ; SSE-NEXT: movaps 496(%rdi), %xmm0
1280 ; SSE-NEXT: movaps 432(%rdi), %xmm1
1281 ; SSE-NEXT: movaps %xmm1, %xmm3
1282 ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0]
1283 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1284 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1285 ; SSE-NEXT: movaps %xmm0, 32(%rsi)
1286 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1287 ; SSE-NEXT: movaps %xmm0, 48(%rsi)
1288 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1289 ; SSE-NEXT: movaps %xmm0, (%rsi)
1290 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1291 ; SSE-NEXT: movaps %xmm0, 16(%rsi)
1292 ; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
1293 ; SSE-NEXT: movaps %xmm0, 32(%rdx)
1294 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1295 ; SSE-NEXT: movaps %xmm0, 48(%rdx)
1296 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1297 ; SSE-NEXT: movaps %xmm0, (%rdx)
1298 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1299 ; SSE-NEXT: movaps %xmm0, 16(%rdx)
1300 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1301 ; SSE-NEXT: movaps %xmm0, 32(%rcx)
1302 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1303 ; SSE-NEXT: movaps %xmm0, 48(%rcx)
1304 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1305 ; SSE-NEXT: movaps %xmm0, (%rcx)
1306 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1307 ; SSE-NEXT: movaps %xmm0, 16(%rcx)
1308 ; SSE-NEXT: movaps %xmm15, 32(%r8)
1309 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1310 ; SSE-NEXT: movaps %xmm0, 48(%r8)
1311 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1312 ; SSE-NEXT: movaps %xmm0, (%r8)
1313 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1314 ; SSE-NEXT: movaps %xmm0, 16(%r8)
1315 ; SSE-NEXT: movaps %xmm12, 32(%r9)
1316 ; SSE-NEXT: movaps %xmm11, 48(%r9)
1317 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1318 ; SSE-NEXT: movaps %xmm0, (%r9)
1319 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1320 ; SSE-NEXT: movaps %xmm0, 16(%r9)
1321 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
1322 ; SSE-NEXT: movaps %xmm8, 48(%rax)
1323 ; SSE-NEXT: movaps %xmm10, 32(%rax)
1324 ; SSE-NEXT: movaps %xmm9, 16(%rax)
1325 ; SSE-NEXT: movaps %xmm14, (%rax)
1326 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
1327 ; SSE-NEXT: movaps %xmm3, 48(%rax)
1328 ; SSE-NEXT: movaps %xmm4, 32(%rax)
1329 ; SSE-NEXT: movaps %xmm6, 16(%rax)
1330 ; SSE-NEXT: movaps %xmm13, (%rax)
1331 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
1332 ; SSE-NEXT: movaps %xmm1, 48(%rax)
1333 ; SSE-NEXT: movaps %xmm2, 32(%rax)
1334 ; SSE-NEXT: movaps %xmm5, 16(%rax)
1335 ; SSE-NEXT: movaps %xmm7, (%rax)
1336 ; SSE-NEXT: addq $152, %rsp
1339 ; AVX-LABEL: load_i64_stride8_vf8:
1341 ; AVX-NEXT: subq $184, %rsp
1342 ; AVX-NEXT: vmovaps 192(%rdi), %ymm0
1343 ; AVX-NEXT: vmovaps 128(%rdi), %ymm1
1344 ; AVX-NEXT: vmovaps 448(%rdi), %ymm2
1345 ; AVX-NEXT: vmovaps 384(%rdi), %ymm3
1346 ; AVX-NEXT: vmovaps 320(%rdi), %xmm4
1347 ; AVX-NEXT: vmovaps 256(%rdi), %xmm5
1348 ; AVX-NEXT: vmovlhps {{.*#+}} xmm6 = xmm5[0],xmm4[0]
1349 ; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1350 ; AVX-NEXT: vmovaps 448(%rdi), %xmm7
1351 ; AVX-NEXT: vmovaps 384(%rdi), %xmm8
1352 ; AVX-NEXT: vmovlhps {{.*#+}} xmm6 = xmm8[0],xmm7[0]
1353 ; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1354 ; AVX-NEXT: vmovaps 192(%rdi), %xmm9
1355 ; AVX-NEXT: vmovaps 128(%rdi), %xmm10
1356 ; AVX-NEXT: vmovlhps {{.*#+}} xmm6 = xmm10[0],xmm9[0]
1357 ; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1358 ; AVX-NEXT: vmovaps 64(%rdi), %xmm11
1359 ; AVX-NEXT: vmovaps (%rdi), %xmm12
1360 ; AVX-NEXT: vmovlhps {{.*#+}} xmm6 = xmm12[0],xmm11[0]
1361 ; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1362 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm5[1],xmm4[1]
1363 ; AVX-NEXT: vmovaps %xmm4, (%rsp) # 16-byte Spill
1364 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm8[1],xmm7[1]
1365 ; AVX-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1366 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm10[1],xmm9[1]
1367 ; AVX-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1368 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm12[1],xmm11[1]
1369 ; AVX-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1370 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
1371 ; AVX-NEXT: vmovaps 336(%rdi), %xmm10
1372 ; AVX-NEXT: vmovaps 272(%rdi), %xmm11
1373 ; AVX-NEXT: vmovlhps {{.*#+}} xmm8 = xmm11[0],xmm10[0]
1374 ; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7]
1375 ; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1376 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
1377 ; AVX-NEXT: vmovaps 80(%rdi), %xmm12
1378 ; AVX-NEXT: vmovaps 16(%rdi), %xmm13
1379 ; AVX-NEXT: vmovlhps {{.*#+}} xmm9 = xmm13[0],xmm12[0]
1380 ; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7]
1381 ; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1382 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm3[1],ymm2[1],ymm3[3],ymm2[3]
1383 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm11[1],xmm10[1]
1384 ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
1385 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1386 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
1387 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm13[1],xmm12[1]
1388 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
1389 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1390 ; AVX-NEXT: vmovaps 480(%rdi), %xmm0
1391 ; AVX-NEXT: vmovaps 416(%rdi), %xmm1
1392 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
1393 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1394 ; AVX-NEXT: vmovaps 352(%rdi), %xmm2
1395 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
1396 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1397 ; AVX-NEXT: vmovaps 288(%rdi), %xmm0
1398 ; AVX-NEXT: vmovlhps {{.*#+}} xmm15 = xmm0[0],xmm2[0]
1399 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
1400 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1401 ; AVX-NEXT: vmovaps 224(%rdi), %xmm0
1402 ; AVX-NEXT: vmovaps 160(%rdi), %xmm1
1403 ; AVX-NEXT: vmovlhps {{.*#+}} xmm12 = xmm1[0],xmm0[0]
1404 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm1[1],xmm0[1]
1405 ; AVX-NEXT: vmovaps 96(%rdi), %xmm1
1406 ; AVX-NEXT: vmovaps 32(%rdi), %xmm2
1407 ; AVX-NEXT: vmovlhps {{.*#+}} xmm8 = xmm2[0],xmm1[0]
1408 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm2[1],xmm1[1]
1409 ; AVX-NEXT: vmovaps 480(%rdi), %ymm6
1410 ; AVX-NEXT: vmovaps 416(%rdi), %ymm5
1411 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm6[0],ymm5[2],ymm6[2]
1412 ; AVX-NEXT: vmovaps 368(%rdi), %xmm4
1413 ; AVX-NEXT: vmovaps 304(%rdi), %xmm3
1414 ; AVX-NEXT: vmovlhps {{.*#+}} xmm9 = xmm3[0],xmm4[0]
1415 ; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm9[0,1,2,3],ymm0[4,5,6,7]
1416 ; AVX-NEXT: vmovaps 224(%rdi), %ymm9
1417 ; AVX-NEXT: vmovaps 160(%rdi), %ymm2
1418 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm2[0],ymm9[0],ymm2[2],ymm9[2]
1419 ; AVX-NEXT: vmovaps 112(%rdi), %xmm1
1420 ; AVX-NEXT: vmovaps 48(%rdi), %xmm0
1421 ; AVX-NEXT: vmovlhps {{.*#+}} xmm14 = xmm0[0],xmm1[0]
1422 ; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm14[0,1,2,3],ymm7[4,5,6,7]
1423 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm5[1],ymm6[1],ymm5[3],ymm6[3]
1424 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1]
1425 ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7]
1426 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm9[1],ymm2[3],ymm9[3]
1427 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
1428 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
1429 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1430 ; AVX-NEXT: vmovaps %xmm1, (%rsi)
1431 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1432 ; AVX-NEXT: vmovaps %xmm1, 16(%rsi)
1433 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1434 ; AVX-NEXT: vmovaps %xmm1, 48(%rsi)
1435 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1436 ; AVX-NEXT: vmovaps %xmm1, 32(%rsi)
1437 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1438 ; AVX-NEXT: vmovaps %xmm1, (%rdx)
1439 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1440 ; AVX-NEXT: vmovaps %xmm1, 16(%rdx)
1441 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1442 ; AVX-NEXT: vmovaps %xmm1, 48(%rdx)
1443 ; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
1444 ; AVX-NEXT: vmovaps %xmm1, 32(%rdx)
1445 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
1446 ; AVX-NEXT: vmovaps %ymm1, (%rcx)
1447 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
1448 ; AVX-NEXT: vmovaps %ymm1, 32(%rcx)
1449 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
1450 ; AVX-NEXT: vmovaps %ymm1, (%r8)
1451 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
1452 ; AVX-NEXT: vmovaps %ymm1, 32(%r8)
1453 ; AVX-NEXT: vmovaps %xmm8, (%r9)
1454 ; AVX-NEXT: vmovaps %xmm12, 16(%r9)
1455 ; AVX-NEXT: vmovaps %xmm15, 32(%r9)
1456 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1457 ; AVX-NEXT: vmovaps %xmm1, 48(%r9)
1458 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
1459 ; AVX-NEXT: vmovaps %xmm11, (%rax)
1460 ; AVX-NEXT: vmovaps %xmm13, 16(%rax)
1461 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1462 ; AVX-NEXT: vmovaps %xmm1, 32(%rax)
1463 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1464 ; AVX-NEXT: vmovaps %xmm1, 48(%rax)
1465 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
1466 ; AVX-NEXT: vmovaps %ymm7, (%rax)
1467 ; AVX-NEXT: vmovaps %ymm10, 32(%rax)
1468 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
1469 ; AVX-NEXT: vmovaps %ymm0, (%rax)
1470 ; AVX-NEXT: vmovaps %ymm3, 32(%rax)
1471 ; AVX-NEXT: addq $184, %rsp
1472 ; AVX-NEXT: vzeroupper
1475 ; AVX2-LABEL: load_i64_stride8_vf8:
1477 ; AVX2-NEXT: subq $136, %rsp
1478 ; AVX2-NEXT: vmovaps 352(%rdi), %ymm0
1479 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1480 ; AVX2-NEXT: vmovaps 64(%rdi), %ymm8
1481 ; AVX2-NEXT: vmovaps (%rdi), %ymm10
1482 ; AVX2-NEXT: vmovaps 192(%rdi), %ymm11
1483 ; AVX2-NEXT: vmovaps 128(%rdi), %ymm13
1484 ; AVX2-NEXT: vmovaps 320(%rdi), %ymm7
1485 ; AVX2-NEXT: vmovaps 256(%rdi), %ymm14
1486 ; AVX2-NEXT: vmovaps 448(%rdi), %ymm12
1487 ; AVX2-NEXT: vmovaps 384(%rdi), %ymm15
1488 ; AVX2-NEXT: vmovaps 320(%rdi), %xmm1
1489 ; AVX2-NEXT: vinsertf128 $1, 448(%rdi), %ymm1, %ymm2
1490 ; AVX2-NEXT: vmovaps 256(%rdi), %xmm1
1491 ; AVX2-NEXT: vinsertf128 $1, 384(%rdi), %ymm1, %ymm4
1492 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],ymm2[0],ymm4[2],ymm2[2]
1493 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1494 ; AVX2-NEXT: vmovaps (%rdi), %xmm3
1495 ; AVX2-NEXT: vmovaps 64(%rdi), %xmm5
1496 ; AVX2-NEXT: vinsertf128 $1, 192(%rdi), %ymm5, %ymm5
1497 ; AVX2-NEXT: vinsertf128 $1, 128(%rdi), %ymm3, %ymm6
1498 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm6[0],ymm5[0],ymm6[2],ymm5[2]
1499 ; AVX2-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
1500 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm4[1],ymm2[1],ymm4[3],ymm2[3]
1501 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1502 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm5[1],ymm6[3],ymm5[3]
1503 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1504 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm15[0],ymm12[0],ymm15[2],ymm12[2]
1505 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm14[0],ymm7[0],ymm14[2],ymm7[2]
1506 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm6[2,3],ymm5[2,3]
1507 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1508 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm13[0],ymm11[0],ymm13[2],ymm11[2]
1509 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm10[0],ymm8[0],ymm10[2],ymm8[2]
1510 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm9[2,3],ymm6[2,3]
1511 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1512 ; AVX2-NEXT: vmovaps 288(%rdi), %ymm9
1513 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm15[1],ymm12[1],ymm15[3],ymm12[3]
1514 ; AVX2-NEXT: vmovaps 480(%rdi), %ymm12
1515 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm14[1],ymm7[1],ymm14[3],ymm7[3]
1516 ; AVX2-NEXT: vmovaps 32(%rdi), %xmm14
1517 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],ymm15[2,3]
1518 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1519 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm13[1],ymm11[1],ymm13[3],ymm11[3]
1520 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm10[1],ymm8[1],ymm10[3],ymm8[3]
1521 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm8[2,3],ymm11[2,3]
1522 ; AVX2-NEXT: vmovaps 352(%rdi), %xmm10
1523 ; AVX2-NEXT: vinsertf128 $1, 480(%rdi), %ymm10, %ymm11
1524 ; AVX2-NEXT: vmovaps 288(%rdi), %xmm10
1525 ; AVX2-NEXT: vinsertf128 $1, 416(%rdi), %ymm10, %ymm13
1526 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm13[0],ymm11[0],ymm13[2],ymm11[2]
1527 ; AVX2-NEXT: vmovaps 96(%rdi), %xmm15
1528 ; AVX2-NEXT: vinsertf128 $1, 224(%rdi), %ymm15, %ymm15
1529 ; AVX2-NEXT: vinsertf128 $1, 160(%rdi), %ymm14, %ymm14
1530 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm13[1],ymm11[1],ymm13[3],ymm11[3]
1531 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm14[0],ymm15[0],ymm14[2],ymm15[2]
1532 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm14[1],ymm15[1],ymm14[3],ymm15[3]
1533 ; AVX2-NEXT: vmovaps 416(%rdi), %ymm15
1534 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm12[0],ymm15[2],ymm12[2]
1535 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
1536 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm9[0],ymm3[0],ymm9[2],ymm3[2]
1537 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm1[2,3],ymm0[2,3]
1538 ; AVX2-NEXT: vmovaps 96(%rdi), %ymm4
1539 ; AVX2-NEXT: vmovaps 32(%rdi), %ymm2
1540 ; AVX2-NEXT: vmovaps 224(%rdi), %ymm1
1541 ; AVX2-NEXT: vmovaps 160(%rdi), %ymm0
1542 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
1543 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm2[0],ymm4[0],ymm2[2],ymm4[2]
1544 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm6[2,3],ymm5[2,3]
1545 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm15[1],ymm12[1],ymm15[3],ymm12[3]
1546 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm9[1],ymm3[1],ymm9[3],ymm3[3]
1547 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm9[2,3],ymm6[2,3]
1548 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
1549 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm4[1],ymm2[3],ymm4[3]
1550 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
1551 ; AVX2-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload
1552 ; AVX2-NEXT: vmovaps %ymm1, (%rsi)
1553 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
1554 ; AVX2-NEXT: vmovaps %ymm1, 32(%rsi)
1555 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
1556 ; AVX2-NEXT: vmovaps %ymm1, (%rdx)
1557 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
1558 ; AVX2-NEXT: vmovaps %ymm1, 32(%rdx)
1559 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
1560 ; AVX2-NEXT: vmovaps %ymm1, (%rcx)
1561 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
1562 ; AVX2-NEXT: vmovaps %ymm1, 32(%rcx)
1563 ; AVX2-NEXT: vmovaps %ymm8, (%r8)
1564 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
1565 ; AVX2-NEXT: vmovaps %ymm1, 32(%r8)
1566 ; AVX2-NEXT: vmovaps %ymm13, (%r9)
1567 ; AVX2-NEXT: vmovaps %ymm10, 32(%r9)
1568 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
1569 ; AVX2-NEXT: vmovaps %ymm14, (%rax)
1570 ; AVX2-NEXT: vmovaps %ymm11, 32(%rax)
1571 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
1572 ; AVX2-NEXT: vmovaps %ymm5, (%rax)
1573 ; AVX2-NEXT: vmovaps %ymm7, 32(%rax)
1574 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
1575 ; AVX2-NEXT: vmovaps %ymm0, (%rax)
1576 ; AVX2-NEXT: vmovaps %ymm6, 32(%rax)
1577 ; AVX2-NEXT: addq $136, %rsp
1578 ; AVX2-NEXT: vzeroupper
1581 ; AVX2-FP-LABEL: load_i64_stride8_vf8:
1583 ; AVX2-FP-NEXT: subq $136, %rsp
1584 ; AVX2-FP-NEXT: vmovaps 352(%rdi), %ymm0
1585 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1586 ; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm8
1587 ; AVX2-FP-NEXT: vmovaps (%rdi), %ymm10
1588 ; AVX2-FP-NEXT: vmovaps 192(%rdi), %ymm11
1589 ; AVX2-FP-NEXT: vmovaps 128(%rdi), %ymm13
1590 ; AVX2-FP-NEXT: vmovaps 320(%rdi), %ymm7
1591 ; AVX2-FP-NEXT: vmovaps 256(%rdi), %ymm14
1592 ; AVX2-FP-NEXT: vmovaps 448(%rdi), %ymm12
1593 ; AVX2-FP-NEXT: vmovaps 384(%rdi), %ymm15
1594 ; AVX2-FP-NEXT: vmovaps 320(%rdi), %xmm1
1595 ; AVX2-FP-NEXT: vinsertf128 $1, 448(%rdi), %ymm1, %ymm2
1596 ; AVX2-FP-NEXT: vmovaps 256(%rdi), %xmm1
1597 ; AVX2-FP-NEXT: vinsertf128 $1, 384(%rdi), %ymm1, %ymm4
1598 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],ymm2[0],ymm4[2],ymm2[2]
1599 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1600 ; AVX2-FP-NEXT: vmovaps (%rdi), %xmm3
1601 ; AVX2-FP-NEXT: vmovaps 64(%rdi), %xmm5
1602 ; AVX2-FP-NEXT: vinsertf128 $1, 192(%rdi), %ymm5, %ymm5
1603 ; AVX2-FP-NEXT: vinsertf128 $1, 128(%rdi), %ymm3, %ymm6
1604 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm6[0],ymm5[0],ymm6[2],ymm5[2]
1605 ; AVX2-FP-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
1606 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm4[1],ymm2[1],ymm4[3],ymm2[3]
1607 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1608 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm5[1],ymm6[3],ymm5[3]
1609 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1610 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm15[0],ymm12[0],ymm15[2],ymm12[2]
1611 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm14[0],ymm7[0],ymm14[2],ymm7[2]
1612 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm6[2,3],ymm5[2,3]
1613 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1614 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm13[0],ymm11[0],ymm13[2],ymm11[2]
1615 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm10[0],ymm8[0],ymm10[2],ymm8[2]
1616 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm9[2,3],ymm6[2,3]
1617 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1618 ; AVX2-FP-NEXT: vmovaps 288(%rdi), %ymm9
1619 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm15[1],ymm12[1],ymm15[3],ymm12[3]
1620 ; AVX2-FP-NEXT: vmovaps 480(%rdi), %ymm12
1621 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm14[1],ymm7[1],ymm14[3],ymm7[3]
1622 ; AVX2-FP-NEXT: vmovaps 32(%rdi), %xmm14
1623 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],ymm15[2,3]
1624 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1625 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm13[1],ymm11[1],ymm13[3],ymm11[3]
1626 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm10[1],ymm8[1],ymm10[3],ymm8[3]
1627 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm8[2,3],ymm11[2,3]
1628 ; AVX2-FP-NEXT: vmovaps 352(%rdi), %xmm10
1629 ; AVX2-FP-NEXT: vinsertf128 $1, 480(%rdi), %ymm10, %ymm11
1630 ; AVX2-FP-NEXT: vmovaps 288(%rdi), %xmm10
1631 ; AVX2-FP-NEXT: vinsertf128 $1, 416(%rdi), %ymm10, %ymm13
1632 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm13[0],ymm11[0],ymm13[2],ymm11[2]
1633 ; AVX2-FP-NEXT: vmovaps 96(%rdi), %xmm15
1634 ; AVX2-FP-NEXT: vinsertf128 $1, 224(%rdi), %ymm15, %ymm15
1635 ; AVX2-FP-NEXT: vinsertf128 $1, 160(%rdi), %ymm14, %ymm14
1636 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm13[1],ymm11[1],ymm13[3],ymm11[3]
1637 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm14[0],ymm15[0],ymm14[2],ymm15[2]
1638 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm14[1],ymm15[1],ymm14[3],ymm15[3]
1639 ; AVX2-FP-NEXT: vmovaps 416(%rdi), %ymm15
1640 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm12[0],ymm15[2],ymm12[2]
1641 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
1642 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm9[0],ymm3[0],ymm9[2],ymm3[2]
1643 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm1[2,3],ymm0[2,3]
1644 ; AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm4
1645 ; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm2
1646 ; AVX2-FP-NEXT: vmovaps 224(%rdi), %ymm1
1647 ; AVX2-FP-NEXT: vmovaps 160(%rdi), %ymm0
1648 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
1649 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm2[0],ymm4[0],ymm2[2],ymm4[2]
1650 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm6[2,3],ymm5[2,3]
1651 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm15[1],ymm12[1],ymm15[3],ymm12[3]
1652 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm9[1],ymm3[1],ymm9[3],ymm3[3]
1653 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm9[2,3],ymm6[2,3]
1654 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
1655 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm4[1],ymm2[3],ymm4[3]
1656 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
1657 ; AVX2-FP-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload
1658 ; AVX2-FP-NEXT: vmovaps %ymm1, (%rsi)
1659 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
1660 ; AVX2-FP-NEXT: vmovaps %ymm1, 32(%rsi)
1661 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
1662 ; AVX2-FP-NEXT: vmovaps %ymm1, (%rdx)
1663 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
1664 ; AVX2-FP-NEXT: vmovaps %ymm1, 32(%rdx)
1665 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
1666 ; AVX2-FP-NEXT: vmovaps %ymm1, (%rcx)
1667 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
1668 ; AVX2-FP-NEXT: vmovaps %ymm1, 32(%rcx)
1669 ; AVX2-FP-NEXT: vmovaps %ymm8, (%r8)
1670 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
1671 ; AVX2-FP-NEXT: vmovaps %ymm1, 32(%r8)
1672 ; AVX2-FP-NEXT: vmovaps %ymm13, (%r9)
1673 ; AVX2-FP-NEXT: vmovaps %ymm10, 32(%r9)
1674 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
1675 ; AVX2-FP-NEXT: vmovaps %ymm14, (%rax)
1676 ; AVX2-FP-NEXT: vmovaps %ymm11, 32(%rax)
1677 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
1678 ; AVX2-FP-NEXT: vmovaps %ymm5, (%rax)
1679 ; AVX2-FP-NEXT: vmovaps %ymm7, 32(%rax)
1680 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
1681 ; AVX2-FP-NEXT: vmovaps %ymm0, (%rax)
1682 ; AVX2-FP-NEXT: vmovaps %ymm6, 32(%rax)
1683 ; AVX2-FP-NEXT: addq $136, %rsp
1684 ; AVX2-FP-NEXT: vzeroupper
1685 ; AVX2-FP-NEXT: retq
1687 ; AVX2-FCP-LABEL: load_i64_stride8_vf8:
1688 ; AVX2-FCP: # %bb.0:
1689 ; AVX2-FCP-NEXT: subq $136, %rsp
1690 ; AVX2-FCP-NEXT: vmovaps 352(%rdi), %ymm0
1691 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1692 ; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm8
1693 ; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm10
1694 ; AVX2-FCP-NEXT: vmovaps 192(%rdi), %ymm11
1695 ; AVX2-FCP-NEXT: vmovaps 128(%rdi), %ymm13
1696 ; AVX2-FCP-NEXT: vmovaps 320(%rdi), %ymm7
1697 ; AVX2-FCP-NEXT: vmovaps 256(%rdi), %ymm14
1698 ; AVX2-FCP-NEXT: vmovaps 448(%rdi), %ymm12
1699 ; AVX2-FCP-NEXT: vmovaps 384(%rdi), %ymm15
1700 ; AVX2-FCP-NEXT: vmovaps 320(%rdi), %xmm1
1701 ; AVX2-FCP-NEXT: vinsertf128 $1, 448(%rdi), %ymm1, %ymm2
1702 ; AVX2-FCP-NEXT: vmovaps 256(%rdi), %xmm1
1703 ; AVX2-FCP-NEXT: vinsertf128 $1, 384(%rdi), %ymm1, %ymm4
1704 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],ymm2[0],ymm4[2],ymm2[2]
1705 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1706 ; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm3
1707 ; AVX2-FCP-NEXT: vmovaps 64(%rdi), %xmm5
1708 ; AVX2-FCP-NEXT: vinsertf128 $1, 192(%rdi), %ymm5, %ymm5
1709 ; AVX2-FCP-NEXT: vinsertf128 $1, 128(%rdi), %ymm3, %ymm6
1710 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm6[0],ymm5[0],ymm6[2],ymm5[2]
1711 ; AVX2-FCP-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
1712 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm4[1],ymm2[1],ymm4[3],ymm2[3]
1713 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1714 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm5[1],ymm6[3],ymm5[3]
1715 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1716 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm15[0],ymm12[0],ymm15[2],ymm12[2]
1717 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm14[0],ymm7[0],ymm14[2],ymm7[2]
1718 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm6[2,3],ymm5[2,3]
1719 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1720 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm13[0],ymm11[0],ymm13[2],ymm11[2]
1721 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm10[0],ymm8[0],ymm10[2],ymm8[2]
1722 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm9[2,3],ymm6[2,3]
1723 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1724 ; AVX2-FCP-NEXT: vmovaps 288(%rdi), %ymm9
1725 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm15[1],ymm12[1],ymm15[3],ymm12[3]
1726 ; AVX2-FCP-NEXT: vmovaps 480(%rdi), %ymm12
1727 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm14[1],ymm7[1],ymm14[3],ymm7[3]
1728 ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %xmm14
1729 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],ymm15[2,3]
1730 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1731 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm13[1],ymm11[1],ymm13[3],ymm11[3]
1732 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm10[1],ymm8[1],ymm10[3],ymm8[3]
1733 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm8[2,3],ymm11[2,3]
1734 ; AVX2-FCP-NEXT: vmovaps 352(%rdi), %xmm10
1735 ; AVX2-FCP-NEXT: vinsertf128 $1, 480(%rdi), %ymm10, %ymm11
1736 ; AVX2-FCP-NEXT: vmovaps 288(%rdi), %xmm10
1737 ; AVX2-FCP-NEXT: vinsertf128 $1, 416(%rdi), %ymm10, %ymm13
1738 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm13[0],ymm11[0],ymm13[2],ymm11[2]
1739 ; AVX2-FCP-NEXT: vmovaps 96(%rdi), %xmm15
1740 ; AVX2-FCP-NEXT: vinsertf128 $1, 224(%rdi), %ymm15, %ymm15
1741 ; AVX2-FCP-NEXT: vinsertf128 $1, 160(%rdi), %ymm14, %ymm14
1742 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm13[1],ymm11[1],ymm13[3],ymm11[3]
1743 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm14[0],ymm15[0],ymm14[2],ymm15[2]
1744 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm14[1],ymm15[1],ymm14[3],ymm15[3]
1745 ; AVX2-FCP-NEXT: vmovaps 416(%rdi), %ymm15
1746 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm12[0],ymm15[2],ymm12[2]
1747 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
1748 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm9[0],ymm3[0],ymm9[2],ymm3[2]
1749 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm1[2,3],ymm0[2,3]
1750 ; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm4
1751 ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm2
1752 ; AVX2-FCP-NEXT: vmovaps 224(%rdi), %ymm1
1753 ; AVX2-FCP-NEXT: vmovaps 160(%rdi), %ymm0
1754 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
1755 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm2[0],ymm4[0],ymm2[2],ymm4[2]
1756 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm6[2,3],ymm5[2,3]
1757 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm15[1],ymm12[1],ymm15[3],ymm12[3]
1758 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm9[1],ymm3[1],ymm9[3],ymm3[3]
1759 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm9[2,3],ymm6[2,3]
1760 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
1761 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm4[1],ymm2[3],ymm4[3]
1762 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
1763 ; AVX2-FCP-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload
1764 ; AVX2-FCP-NEXT: vmovaps %ymm1, (%rsi)
1765 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
1766 ; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%rsi)
1767 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
1768 ; AVX2-FCP-NEXT: vmovaps %ymm1, (%rdx)
1769 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
1770 ; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%rdx)
1771 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
1772 ; AVX2-FCP-NEXT: vmovaps %ymm1, (%rcx)
1773 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
1774 ; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%rcx)
1775 ; AVX2-FCP-NEXT: vmovaps %ymm8, (%r8)
1776 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
1777 ; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%r8)
1778 ; AVX2-FCP-NEXT: vmovaps %ymm13, (%r9)
1779 ; AVX2-FCP-NEXT: vmovaps %ymm10, 32(%r9)
1780 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
1781 ; AVX2-FCP-NEXT: vmovaps %ymm14, (%rax)
1782 ; AVX2-FCP-NEXT: vmovaps %ymm11, 32(%rax)
1783 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
1784 ; AVX2-FCP-NEXT: vmovaps %ymm5, (%rax)
1785 ; AVX2-FCP-NEXT: vmovaps %ymm7, 32(%rax)
1786 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
1787 ; AVX2-FCP-NEXT: vmovaps %ymm0, (%rax)
1788 ; AVX2-FCP-NEXT: vmovaps %ymm6, 32(%rax)
1789 ; AVX2-FCP-NEXT: addq $136, %rsp
1790 ; AVX2-FCP-NEXT: vzeroupper
1791 ; AVX2-FCP-NEXT: retq
1793 ; AVX512-LABEL: load_i64_stride8_vf8:
1795 ; AVX512-NEXT: pushq %rbx
1796 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
1797 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
1798 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11
1799 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm2
1800 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm1
1801 ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm7
1802 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm6
1803 ; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm3
1804 ; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm0
1805 ; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm4
1806 ; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm5
1807 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,0,8,0,8,0,8]
1808 ; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
1809 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm9
1810 ; AVX512-NEXT: vpermt2q %zmm4, %zmm8, %zmm9
1811 ; AVX512-NEXT: vpermi2q %zmm3, %zmm0, %zmm8
1812 ; AVX512-NEXT: movb $-64, %bl
1813 ; AVX512-NEXT: kmovw %ebx, %k1
1814 ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1}
1815 ; AVX512-NEXT: vmovdqa (%rdi), %xmm9
1816 ; AVX512-NEXT: vmovdqa 64(%rdi), %xmm10
1817 ; AVX512-NEXT: vinserti128 $1, 192(%rdi), %ymm10, %ymm10
1818 ; AVX512-NEXT: vinserti128 $1, 128(%rdi), %ymm9, %ymm9
1819 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm10[0],ymm9[2],ymm10[2]
1820 ; AVX512-NEXT: vinserti64x4 $0, %ymm11, %zmm8, %zmm16
1821 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [1,9,1,9,1,9,1,9]
1822 ; AVX512-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
1823 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm12
1824 ; AVX512-NEXT: vpermt2q %zmm4, %zmm11, %zmm12
1825 ; AVX512-NEXT: vpermi2q %zmm3, %zmm0, %zmm11
1826 ; AVX512-NEXT: vmovdqa64 %zmm12, %zmm11 {%k1}
1827 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm10[1],ymm9[3],ymm10[3]
1828 ; AVX512-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9
1829 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,2,10,2,10,2,10]
1830 ; AVX512-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
1831 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm11
1832 ; AVX512-NEXT: vpermt2q %zmm4, %zmm10, %zmm11
1833 ; AVX512-NEXT: vpermi2q %zmm3, %zmm0, %zmm10
1834 ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm10 {%k1}
1835 ; AVX512-NEXT: vmovdqa 192(%rdi), %ymm11
1836 ; AVX512-NEXT: vmovdqa 128(%rdi), %ymm12
1837 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm11[0],ymm12[2],ymm11[2]
1838 ; AVX512-NEXT: vmovdqa 64(%rdi), %ymm14
1839 ; AVX512-NEXT: vmovdqa (%rdi), %ymm15
1840 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm15[0],ymm14[0],ymm15[2],ymm14[2]
1841 ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm8[2,3],ymm13[2,3]
1842 ; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm10
1843 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [3,11,3,11,3,11,3,11]
1844 ; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
1845 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm13
1846 ; AVX512-NEXT: vpermt2q %zmm4, %zmm8, %zmm13
1847 ; AVX512-NEXT: vpermi2q %zmm3, %zmm0, %zmm8
1848 ; AVX512-NEXT: vmovdqa64 %zmm13, %zmm8 {%k1}
1849 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm12[1],ymm11[1],ymm12[3],ymm11[3]
1850 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm15[1],ymm14[1],ymm15[3],ymm14[3]
1851 ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3]
1852 ; AVX512-NEXT: vinserti64x4 $0, %ymm11, %zmm8, %zmm8
1853 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [4,12,4,12,4,12,4,12]
1854 ; AVX512-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
1855 ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm12
1856 ; AVX512-NEXT: vpermt2q %zmm7, %zmm11, %zmm12
1857 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm13
1858 ; AVX512-NEXT: vpermt2q %zmm2, %zmm11, %zmm13
1859 ; AVX512-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7]
1860 ; AVX512-NEXT: vpermi2q %zmm4, %zmm5, %zmm11
1861 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm13 = zmm0[0],zmm3[0],zmm0[2],zmm3[2],zmm0[4],zmm3[4],zmm0[6],zmm3[6]
1862 ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm13 {%k1}
1863 ; AVX512-NEXT: vinserti64x4 $0, %ymm12, %zmm13, %zmm11
1864 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,5,13,5,13,5,13]
1865 ; AVX512-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
1866 ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm13
1867 ; AVX512-NEXT: vpermt2q %zmm7, %zmm12, %zmm13
1868 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm14
1869 ; AVX512-NEXT: vpermt2q %zmm2, %zmm12, %zmm14
1870 ; AVX512-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7]
1871 ; AVX512-NEXT: vpermi2q %zmm4, %zmm5, %zmm12
1872 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm0[1],zmm3[1],zmm0[3],zmm3[3],zmm0[5],zmm3[5],zmm0[7],zmm3[7]
1873 ; AVX512-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1}
1874 ; AVX512-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm12
1875 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14]
1876 ; AVX512-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
1877 ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm14
1878 ; AVX512-NEXT: vpermt2q %zmm7, %zmm13, %zmm14
1879 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm15
1880 ; AVX512-NEXT: vpermt2q %zmm2, %zmm13, %zmm15
1881 ; AVX512-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7]
1882 ; AVX512-NEXT: vpermi2q %zmm3, %zmm0, %zmm13
1883 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm13 {%k1} = zmm5[0],zmm4[0],zmm5[2],zmm4[2],zmm5[4],zmm4[4],zmm5[6],zmm4[6]
1884 ; AVX512-NEXT: vinserti64x4 $0, %ymm14, %zmm13, %zmm13
1885 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [7,15,7,15,7,15,7,15]
1886 ; AVX512-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
1887 ; AVX512-NEXT: vpermt2q %zmm7, %zmm14, %zmm6
1888 ; AVX512-NEXT: vpermt2q %zmm2, %zmm14, %zmm1
1889 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7]
1890 ; AVX512-NEXT: vpermt2q %zmm3, %zmm14, %zmm0
1891 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm5[1],zmm4[1],zmm5[3],zmm4[3],zmm5[5],zmm4[5],zmm5[7],zmm4[7]
1892 ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
1893 ; AVX512-NEXT: vmovdqa64 %zmm16, (%rsi)
1894 ; AVX512-NEXT: vmovdqa64 %zmm9, (%rdx)
1895 ; AVX512-NEXT: vmovdqa64 %zmm10, (%rcx)
1896 ; AVX512-NEXT: vmovdqa64 %zmm8, (%r8)
1897 ; AVX512-NEXT: vmovdqa64 %zmm11, (%r9)
1898 ; AVX512-NEXT: vmovdqa64 %zmm12, (%r11)
1899 ; AVX512-NEXT: vmovdqa64 %zmm13, (%r10)
1900 ; AVX512-NEXT: vmovdqa64 %zmm0, (%rax)
1901 ; AVX512-NEXT: popq %rbx
1902 ; AVX512-NEXT: vzeroupper
1903 ; AVX512-NEXT: retq
1905 ; AVX512-FCP-LABEL: load_i64_stride8_vf8:
1906 ; AVX512-FCP: # %bb.0:
1907 ; AVX512-FCP-NEXT: pushq %rbx
1908 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
1909 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
1910 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
1911 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2
1912 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm1
1913 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm7
1914 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm6
1915 ; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm3
1916 ; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm0
1917 ; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm4
1918 ; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm5
1919 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,0,8,0,8,0,8]
1920 ; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
1921 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm9
1922 ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm9
1923 ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm0, %zmm8
1924 ; AVX512-FCP-NEXT: movb $-64, %bl
1925 ; AVX512-FCP-NEXT: kmovw %ebx, %k1
1926 ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1}
1927 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm9
1928 ; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm10
1929 ; AVX512-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm10, %ymm10
1930 ; AVX512-FCP-NEXT: vinserti128 $1, 128(%rdi), %ymm9, %ymm9
1931 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm10[0],ymm9[2],ymm10[2]
1932 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm8, %zmm16
1933 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [1,9,1,9,1,9,1,9]
1934 ; AVX512-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
1935 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm12
1936 ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm11, %zmm12
1937 ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm0, %zmm11
1938 ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm11 {%k1}
1939 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm10[1],ymm9[3],ymm10[3]
1940 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9
1941 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,2,10,2,10,2,10]
1942 ; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
1943 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm11
1944 ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm10, %zmm11
1945 ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm0, %zmm10
1946 ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm10 {%k1}
1947 ; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm11
1948 ; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm12
1949 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm11[0],ymm12[2],ymm11[2]
1950 ; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm14
1951 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm15
1952 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm15[0],ymm14[0],ymm15[2],ymm14[2]
1953 ; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm8[2,3],ymm13[2,3]
1954 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm10
1955 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [3,11,3,11,3,11,3,11]
1956 ; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
1957 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm13
1958 ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm13
1959 ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm0, %zmm8
1960 ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm8 {%k1}
1961 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm12[1],ymm11[1],ymm12[3],ymm11[3]
1962 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm15[1],ymm14[1],ymm15[3],ymm14[3]
1963 ; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3]
1964 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm8, %zmm8
1965 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [4,12,4,12,4,12,4,12]
1966 ; AVX512-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
1967 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm12
1968 ; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm11, %zmm12
1969 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm13
1970 ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm13
1971 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7]
1972 ; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm11
1973 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm13 = zmm0[0],zmm3[0],zmm0[2],zmm3[2],zmm0[4],zmm3[4],zmm0[6],zmm3[6]
1974 ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm13 {%k1}
1975 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm13, %zmm11
1976 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,5,13,5,13,5,13]
1977 ; AVX512-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
1978 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm13
1979 ; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm12, %zmm13
1980 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm14
1981 ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm14
1982 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7]
1983 ; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm12
1984 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm0[1],zmm3[1],zmm0[3],zmm3[3],zmm0[5],zmm3[5],zmm0[7],zmm3[7]
1985 ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1}
1986 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm12
1987 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14]
1988 ; AVX512-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
1989 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm14
1990 ; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm13, %zmm14
1991 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm15
1992 ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm13, %zmm15
1993 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7]
1994 ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm0, %zmm13
1995 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm13 {%k1} = zmm5[0],zmm4[0],zmm5[2],zmm4[2],zmm5[4],zmm4[4],zmm5[6],zmm4[6]
1996 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm14, %zmm13, %zmm13
1997 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [7,15,7,15,7,15,7,15]
1998 ; AVX512-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
1999 ; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm14, %zmm6
2000 ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm14, %zmm1
2001 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7]
2002 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm14, %zmm0
2003 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm5[1],zmm4[1],zmm5[3],zmm4[3],zmm5[5],zmm4[5],zmm5[7],zmm4[7]
2004 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
2005 ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, (%rsi)
2006 ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, (%rdx)
2007 ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, (%rcx)
2008 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, (%r8)
2009 ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, (%r9)
2010 ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, (%r11)
2011 ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, (%r10)
2012 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%rax)
2013 ; AVX512-FCP-NEXT: popq %rbx
2014 ; AVX512-FCP-NEXT: vzeroupper
2015 ; AVX512-FCP-NEXT: retq
2017 ; AVX512DQ-LABEL: load_i64_stride8_vf8:
2018 ; AVX512DQ: # %bb.0:
2019 ; AVX512DQ-NEXT: pushq %rbx
2020 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
2021 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10
2022 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r11
2023 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm2
2024 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm1
2025 ; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm7
2026 ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm6
2027 ; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm3
2028 ; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm0
2029 ; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm4
2030 ; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm5
2031 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,0,8,0,8,0,8]
2032 ; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2033 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm9
2034 ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm8, %zmm9
2035 ; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm0, %zmm8
2036 ; AVX512DQ-NEXT: movb $-64, %bl
2037 ; AVX512DQ-NEXT: kmovw %ebx, %k1
2038 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1}
2039 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm9
2040 ; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm10
2041 ; AVX512DQ-NEXT: vinserti128 $1, 192(%rdi), %ymm10, %ymm10
2042 ; AVX512DQ-NEXT: vinserti128 $1, 128(%rdi), %ymm9, %ymm9
2043 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm10[0],ymm9[2],ymm10[2]
2044 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm11, %zmm8, %zmm16
2045 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [1,9,1,9,1,9,1,9]
2046 ; AVX512DQ-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2047 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm12
2048 ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm11, %zmm12
2049 ; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm0, %zmm11
2050 ; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm11 {%k1}
2051 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm10[1],ymm9[3],ymm10[3]
2052 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9
2053 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,2,10,2,10,2,10]
2054 ; AVX512DQ-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2055 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm11
2056 ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm10, %zmm11
2057 ; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm0, %zmm10
2058 ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm10 {%k1}
2059 ; AVX512DQ-NEXT: vmovdqa 192(%rdi), %ymm11
2060 ; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm12
2061 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm11[0],ymm12[2],ymm11[2]
2062 ; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm14
2063 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm15
2064 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm15[0],ymm14[0],ymm15[2],ymm14[2]
2065 ; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm8[2,3],ymm13[2,3]
2066 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm10
2067 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [3,11,3,11,3,11,3,11]
2068 ; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2069 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm13
2070 ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm8, %zmm13
2071 ; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm0, %zmm8
2072 ; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm8 {%k1}
2073 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm12[1],ymm11[1],ymm12[3],ymm11[3]
2074 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm15[1],ymm14[1],ymm15[3],ymm14[3]
2075 ; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3]
2076 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm11, %zmm8, %zmm8
2077 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [4,12,4,12,4,12,4,12]
2078 ; AVX512DQ-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2079 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm12
2080 ; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm11, %zmm12
2081 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm13
2082 ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm11, %zmm13
2083 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7]
2084 ; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm5, %zmm11
2085 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm13 = zmm0[0],zmm3[0],zmm0[2],zmm3[2],zmm0[4],zmm3[4],zmm0[6],zmm3[6]
2086 ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm13 {%k1}
2087 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm12, %zmm13, %zmm11
2088 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,5,13,5,13,5,13]
2089 ; AVX512DQ-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2090 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm13
2091 ; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm12, %zmm13
2092 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm14
2093 ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm12, %zmm14
2094 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7]
2095 ; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm5, %zmm12
2096 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm0[1],zmm3[1],zmm0[3],zmm3[3],zmm0[5],zmm3[5],zmm0[7],zmm3[7]
2097 ; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1}
2098 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm12
2099 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14]
2100 ; AVX512DQ-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2101 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm14
2102 ; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm13, %zmm14
2103 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm15
2104 ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm13, %zmm15
2105 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7]
2106 ; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm0, %zmm13
2107 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm13 {%k1} = zmm5[0],zmm4[0],zmm5[2],zmm4[2],zmm5[4],zmm4[4],zmm5[6],zmm4[6]
2108 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm14, %zmm13, %zmm13
2109 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [7,15,7,15,7,15,7,15]
2110 ; AVX512DQ-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2111 ; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm14, %zmm6
2112 ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm14, %zmm1
2113 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7]
2114 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm14, %zmm0
2115 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm5[1],zmm4[1],zmm5[3],zmm4[3],zmm5[5],zmm4[5],zmm5[7],zmm4[7]
2116 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
2117 ; AVX512DQ-NEXT: vmovdqa64 %zmm16, (%rsi)
2118 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, (%rdx)
2119 ; AVX512DQ-NEXT: vmovdqa64 %zmm10, (%rcx)
2120 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, (%r8)
2121 ; AVX512DQ-NEXT: vmovdqa64 %zmm11, (%r9)
2122 ; AVX512DQ-NEXT: vmovdqa64 %zmm12, (%r11)
2123 ; AVX512DQ-NEXT: vmovdqa64 %zmm13, (%r10)
2124 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rax)
2125 ; AVX512DQ-NEXT: popq %rbx
2126 ; AVX512DQ-NEXT: vzeroupper
2127 ; AVX512DQ-NEXT: retq
2129 ; AVX512DQ-FCP-LABEL: load_i64_stride8_vf8:
2130 ; AVX512DQ-FCP: # %bb.0:
2131 ; AVX512DQ-FCP-NEXT: pushq %rbx
2132 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
2133 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
2134 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
2135 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2
2136 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm1
2137 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm7
2138 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm6
2139 ; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm3
2140 ; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm0
2141 ; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm4
2142 ; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm5
2143 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,0,8,0,8,0,8]
2144 ; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2145 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm9
2146 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm9
2147 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm0, %zmm8
2148 ; AVX512DQ-FCP-NEXT: movb $-64, %bl
2149 ; AVX512DQ-FCP-NEXT: kmovw %ebx, %k1
2150 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1}
2151 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm9
2152 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm10
2153 ; AVX512DQ-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm10, %ymm10
2154 ; AVX512DQ-FCP-NEXT: vinserti128 $1, 128(%rdi), %ymm9, %ymm9
2155 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm10[0],ymm9[2],ymm10[2]
2156 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm8, %zmm16
2157 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [1,9,1,9,1,9,1,9]
2158 ; AVX512DQ-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2159 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm12
2160 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm11, %zmm12
2161 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm0, %zmm11
2162 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm11 {%k1}
2163 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm10[1],ymm9[3],ymm10[3]
2164 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9
2165 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,2,10,2,10,2,10]
2166 ; AVX512DQ-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2167 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm11
2168 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm10, %zmm11
2169 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm0, %zmm10
2170 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm10 {%k1}
2171 ; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm11
2172 ; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm12
2173 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm11[0],ymm12[2],ymm11[2]
2174 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm14
2175 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm15
2176 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm15[0],ymm14[0],ymm15[2],ymm14[2]
2177 ; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm8[2,3],ymm13[2,3]
2178 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm10
2179 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [3,11,3,11,3,11,3,11]
2180 ; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2181 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm13
2182 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm13
2183 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm0, %zmm8
2184 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm8 {%k1}
2185 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm12[1],ymm11[1],ymm12[3],ymm11[3]
2186 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm15[1],ymm14[1],ymm15[3],ymm14[3]
2187 ; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3]
2188 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm8, %zmm8
2189 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [4,12,4,12,4,12,4,12]
2190 ; AVX512DQ-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2191 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm12
2192 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm11, %zmm12
2193 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm13
2194 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm13
2195 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7]
2196 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm11
2197 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm13 = zmm0[0],zmm3[0],zmm0[2],zmm3[2],zmm0[4],zmm3[4],zmm0[6],zmm3[6]
2198 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm13 {%k1}
2199 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm13, %zmm11
2200 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,5,13,5,13,5,13]
2201 ; AVX512DQ-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2202 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm13
2203 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm12, %zmm13
2204 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm14
2205 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm14
2206 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7]
2207 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm12
2208 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm0[1],zmm3[1],zmm0[3],zmm3[3],zmm0[5],zmm3[5],zmm0[7],zmm3[7]
2209 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1}
2210 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm12
2211 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14]
2212 ; AVX512DQ-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2213 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm14
2214 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm13, %zmm14
2215 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm15
2216 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm13, %zmm15
2217 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7]
2218 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm0, %zmm13
2219 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm13 {%k1} = zmm5[0],zmm4[0],zmm5[2],zmm4[2],zmm5[4],zmm4[4],zmm5[6],zmm4[6]
2220 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm14, %zmm13, %zmm13
2221 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [7,15,7,15,7,15,7,15]
2222 ; AVX512DQ-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2223 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm14, %zmm6
2224 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm14, %zmm1
2225 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7]
2226 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm14, %zmm0
2227 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm5[1],zmm4[1],zmm5[3],zmm4[3],zmm5[5],zmm4[5],zmm5[7],zmm4[7]
2228 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
2229 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, (%rsi)
2230 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, (%rdx)
2231 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, (%rcx)
2232 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, (%r8)
2233 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, (%r9)
2234 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, (%r11)
2235 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, (%r10)
2236 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%rax)
2237 ; AVX512DQ-FCP-NEXT: popq %rbx
2238 ; AVX512DQ-FCP-NEXT: vzeroupper
2239 ; AVX512DQ-FCP-NEXT: retq
2241 ; AVX512BW-LABEL: load_i64_stride8_vf8:
2242 ; AVX512BW: # %bb.0:
2243 ; AVX512BW-NEXT: pushq %rbx
2244 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
2245 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
2246 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
2247 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm2
2248 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1
2249 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm7
2250 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm6
2251 ; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm3
2252 ; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm0
2253 ; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm4
2254 ; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm5
2255 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,0,8,0,8,0,8]
2256 ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2257 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm9
2258 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm8, %zmm9
2259 ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm0, %zmm8
2260 ; AVX512BW-NEXT: movb $-64, %bl
2261 ; AVX512BW-NEXT: kmovd %ebx, %k1
2262 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1}
2263 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm9
2264 ; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm10
2265 ; AVX512BW-NEXT: vinserti128 $1, 192(%rdi), %ymm10, %ymm10
2266 ; AVX512BW-NEXT: vinserti128 $1, 128(%rdi), %ymm9, %ymm9
2267 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm10[0],ymm9[2],ymm10[2]
2268 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm8, %zmm16
2269 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [1,9,1,9,1,9,1,9]
2270 ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2271 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm12
2272 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm11, %zmm12
2273 ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm0, %zmm11
2274 ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm11 {%k1}
2275 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm10[1],ymm9[3],ymm10[3]
2276 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9
2277 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,2,10,2,10,2,10]
2278 ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2279 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm11
2280 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm10, %zmm11
2281 ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm0, %zmm10
2282 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm10 {%k1}
2283 ; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm11
2284 ; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm12
2285 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm11[0],ymm12[2],ymm11[2]
2286 ; AVX512BW-NEXT: vmovdqa 64(%rdi), %ymm14
2287 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm15
2288 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm15[0],ymm14[0],ymm15[2],ymm14[2]
2289 ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm8[2,3],ymm13[2,3]
2290 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm10
2291 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [3,11,3,11,3,11,3,11]
2292 ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2293 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm13
2294 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm8, %zmm13
2295 ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm0, %zmm8
2296 ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm8 {%k1}
2297 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm12[1],ymm11[1],ymm12[3],ymm11[3]
2298 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm15[1],ymm14[1],ymm15[3],ymm14[3]
2299 ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3]
2300 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm8, %zmm8
2301 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [4,12,4,12,4,12,4,12]
2302 ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2303 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm12
2304 ; AVX512BW-NEXT: vpermt2q %zmm7, %zmm11, %zmm12
2305 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13
2306 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm11, %zmm13
2307 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7]
2308 ; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm11
2309 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm13 = zmm0[0],zmm3[0],zmm0[2],zmm3[2],zmm0[4],zmm3[4],zmm0[6],zmm3[6]
2310 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm13 {%k1}
2311 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm13, %zmm11
2312 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,5,13,5,13,5,13]
2313 ; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2314 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm13
2315 ; AVX512BW-NEXT: vpermt2q %zmm7, %zmm12, %zmm13
2316 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm14
2317 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm12, %zmm14
2318 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7]
2319 ; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm12
2320 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm0[1],zmm3[1],zmm0[3],zmm3[3],zmm0[5],zmm3[5],zmm0[7],zmm3[7]
2321 ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1}
2322 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm12
2323 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14]
2324 ; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2325 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm14
2326 ; AVX512BW-NEXT: vpermt2q %zmm7, %zmm13, %zmm14
2327 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm15
2328 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm13, %zmm15
2329 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7]
2330 ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm0, %zmm13
2331 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm13 {%k1} = zmm5[0],zmm4[0],zmm5[2],zmm4[2],zmm5[4],zmm4[4],zmm5[6],zmm4[6]
2332 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm14, %zmm13, %zmm13
2333 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [7,15,7,15,7,15,7,15]
2334 ; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2335 ; AVX512BW-NEXT: vpermt2q %zmm7, %zmm14, %zmm6
2336 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm14, %zmm1
2337 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7]
2338 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm14, %zmm0
2339 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm5[1],zmm4[1],zmm5[3],zmm4[3],zmm5[5],zmm4[5],zmm5[7],zmm4[7]
2340 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
2341 ; AVX512BW-NEXT: vmovdqa64 %zmm16, (%rsi)
2342 ; AVX512BW-NEXT: vmovdqa64 %zmm9, (%rdx)
2343 ; AVX512BW-NEXT: vmovdqa64 %zmm10, (%rcx)
2344 ; AVX512BW-NEXT: vmovdqa64 %zmm8, (%r8)
2345 ; AVX512BW-NEXT: vmovdqa64 %zmm11, (%r9)
2346 ; AVX512BW-NEXT: vmovdqa64 %zmm12, (%r11)
2347 ; AVX512BW-NEXT: vmovdqa64 %zmm13, (%r10)
2348 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rax)
2349 ; AVX512BW-NEXT: popq %rbx
2350 ; AVX512BW-NEXT: vzeroupper
2351 ; AVX512BW-NEXT: retq
2353 ; AVX512BW-FCP-LABEL: load_i64_stride8_vf8:
2354 ; AVX512BW-FCP: # %bb.0:
2355 ; AVX512BW-FCP-NEXT: pushq %rbx
2356 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
2357 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
2358 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
2359 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2
2360 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1
2361 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm7
2362 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm6
2363 ; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm3
2364 ; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm0
2365 ; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm4
2366 ; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm5
2367 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,0,8,0,8,0,8]
2368 ; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2369 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm9
2370 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm9
2371 ; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm0, %zmm8
2372 ; AVX512BW-FCP-NEXT: movb $-64, %bl
2373 ; AVX512BW-FCP-NEXT: kmovd %ebx, %k1
2374 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1}
2375 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm9
2376 ; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm10
2377 ; AVX512BW-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm10, %ymm10
2378 ; AVX512BW-FCP-NEXT: vinserti128 $1, 128(%rdi), %ymm9, %ymm9
2379 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm10[0],ymm9[2],ymm10[2]
2380 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm8, %zmm16
2381 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [1,9,1,9,1,9,1,9]
2382 ; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2383 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm12
2384 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm11, %zmm12
2385 ; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm0, %zmm11
2386 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm11 {%k1}
2387 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm10[1],ymm9[3],ymm10[3]
2388 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9
2389 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,2,10,2,10,2,10]
2390 ; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2391 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm11
2392 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm10, %zmm11
2393 ; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm0, %zmm10
2394 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm10 {%k1}
2395 ; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm11
2396 ; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm12
2397 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm11[0],ymm12[2],ymm11[2]
2398 ; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm14
2399 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm15
2400 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm15[0],ymm14[0],ymm15[2],ymm14[2]
2401 ; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm8[2,3],ymm13[2,3]
2402 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm10
2403 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [3,11,3,11,3,11,3,11]
2404 ; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2405 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm13
2406 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm13
2407 ; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm0, %zmm8
2408 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm8 {%k1}
2409 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm12[1],ymm11[1],ymm12[3],ymm11[3]
2410 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm15[1],ymm14[1],ymm15[3],ymm14[3]
2411 ; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3]
2412 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm8, %zmm8
2413 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [4,12,4,12,4,12,4,12]
2414 ; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2415 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm12
2416 ; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm11, %zmm12
2417 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm13
2418 ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm13
2419 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7]
2420 ; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm11
2421 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm13 = zmm0[0],zmm3[0],zmm0[2],zmm3[2],zmm0[4],zmm3[4],zmm0[6],zmm3[6]
2422 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm13 {%k1}
2423 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm13, %zmm11
2424 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,5,13,5,13,5,13]
2425 ; AVX512BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2426 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm13
2427 ; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm12, %zmm13
2428 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm14
2429 ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm14
2430 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7]
2431 ; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm12
2432 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm0[1],zmm3[1],zmm0[3],zmm3[3],zmm0[5],zmm3[5],zmm0[7],zmm3[7]
2433 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1}
2434 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm12
2435 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14]
2436 ; AVX512BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2437 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm14
2438 ; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm13, %zmm14
2439 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm15
2440 ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm13, %zmm15
2441 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7]
2442 ; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm0, %zmm13
2443 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm13 {%k1} = zmm5[0],zmm4[0],zmm5[2],zmm4[2],zmm5[4],zmm4[4],zmm5[6],zmm4[6]
2444 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm14, %zmm13, %zmm13
2445 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [7,15,7,15,7,15,7,15]
2446 ; AVX512BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2447 ; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm14, %zmm6
2448 ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm14, %zmm1
2449 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7]
2450 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm14, %zmm0
2451 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm5[1],zmm4[1],zmm5[3],zmm4[3],zmm5[5],zmm4[5],zmm5[7],zmm4[7]
2452 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
2453 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, (%rsi)
2454 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, (%rdx)
2455 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, (%rcx)
2456 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, (%r8)
2457 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, (%r9)
2458 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, (%r11)
2459 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, (%r10)
2460 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%rax)
2461 ; AVX512BW-FCP-NEXT: popq %rbx
2462 ; AVX512BW-FCP-NEXT: vzeroupper
2463 ; AVX512BW-FCP-NEXT: retq
2465 ; AVX512DQ-BW-LABEL: load_i64_stride8_vf8:
2466 ; AVX512DQ-BW: # %bb.0:
2467 ; AVX512DQ-BW-NEXT: pushq %rbx
2468 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
2469 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
2470 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
2471 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm2
2472 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm1
2473 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm7
2474 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm6
2475 ; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm3
2476 ; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm0
2477 ; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm4
2478 ; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm5
2479 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,0,8,0,8,0,8]
2480 ; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2481 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm9
2482 ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm8, %zmm9
2483 ; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm0, %zmm8
2484 ; AVX512DQ-BW-NEXT: movb $-64, %bl
2485 ; AVX512DQ-BW-NEXT: kmovd %ebx, %k1
2486 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1}
2487 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm9
2488 ; AVX512DQ-BW-NEXT: vmovdqa 64(%rdi), %xmm10
2489 ; AVX512DQ-BW-NEXT: vinserti128 $1, 192(%rdi), %ymm10, %ymm10
2490 ; AVX512DQ-BW-NEXT: vinserti128 $1, 128(%rdi), %ymm9, %ymm9
2491 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm10[0],ymm9[2],ymm10[2]
2492 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm11, %zmm8, %zmm16
2493 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [1,9,1,9,1,9,1,9]
2494 ; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2495 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm12
2496 ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm11, %zmm12
2497 ; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm0, %zmm11
2498 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm11 {%k1}
2499 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm10[1],ymm9[3],ymm10[3]
2500 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9
2501 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,2,10,2,10,2,10]
2502 ; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2503 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm11
2504 ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm10, %zmm11
2505 ; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm0, %zmm10
2506 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm10 {%k1}
2507 ; AVX512DQ-BW-NEXT: vmovdqa 192(%rdi), %ymm11
2508 ; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm12
2509 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm11[0],ymm12[2],ymm11[2]
2510 ; AVX512DQ-BW-NEXT: vmovdqa 64(%rdi), %ymm14
2511 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm15
2512 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm15[0],ymm14[0],ymm15[2],ymm14[2]
2513 ; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm8[2,3],ymm13[2,3]
2514 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm10
2515 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [3,11,3,11,3,11,3,11]
2516 ; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2517 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm13
2518 ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm8, %zmm13
2519 ; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm0, %zmm8
2520 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm8 {%k1}
2521 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm12[1],ymm11[1],ymm12[3],ymm11[3]
2522 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm15[1],ymm14[1],ymm15[3],ymm14[3]
2523 ; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3]
2524 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm11, %zmm8, %zmm8
2525 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [4,12,4,12,4,12,4,12]
2526 ; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2527 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm12
2528 ; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm11, %zmm12
2529 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm13
2530 ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm11, %zmm13
2531 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7]
2532 ; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm11
2533 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm13 = zmm0[0],zmm3[0],zmm0[2],zmm3[2],zmm0[4],zmm3[4],zmm0[6],zmm3[6]
2534 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm13 {%k1}
2535 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm12, %zmm13, %zmm11
2536 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,5,13,5,13,5,13]
2537 ; AVX512DQ-BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2538 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm13
2539 ; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm12, %zmm13
2540 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm14
2541 ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm12, %zmm14
2542 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7]
2543 ; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm12
2544 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm0[1],zmm3[1],zmm0[3],zmm3[3],zmm0[5],zmm3[5],zmm0[7],zmm3[7]
2545 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1}
2546 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm12
2547 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14]
2548 ; AVX512DQ-BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2549 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm14
2550 ; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm13, %zmm14
2551 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm15
2552 ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm13, %zmm15
2553 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7]
2554 ; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm0, %zmm13
2555 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm13 {%k1} = zmm5[0],zmm4[0],zmm5[2],zmm4[2],zmm5[4],zmm4[4],zmm5[6],zmm4[6]
2556 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm14, %zmm13, %zmm13
2557 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [7,15,7,15,7,15,7,15]
2558 ; AVX512DQ-BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2559 ; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm14, %zmm6
2560 ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm14, %zmm1
2561 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7]
2562 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm14, %zmm0
2563 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm5[1],zmm4[1],zmm5[3],zmm4[3],zmm5[5],zmm4[5],zmm5[7],zmm4[7]
2564 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
2565 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, (%rsi)
2566 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, (%rdx)
2567 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, (%rcx)
2568 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, (%r8)
2569 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, (%r9)
2570 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, (%r11)
2571 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, (%r10)
2572 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%rax)
2573 ; AVX512DQ-BW-NEXT: popq %rbx
2574 ; AVX512DQ-BW-NEXT: vzeroupper
2575 ; AVX512DQ-BW-NEXT: retq
2577 ; AVX512DQ-BW-FCP-LABEL: load_i64_stride8_vf8:
2578 ; AVX512DQ-BW-FCP: # %bb.0:
2579 ; AVX512DQ-BW-FCP-NEXT: pushq %rbx
2580 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
2581 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
2582 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
2583 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2
2584 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1
2585 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm7
2586 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm6
2587 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm3
2588 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm0
2589 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm4
2590 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm5
2591 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,0,8,0,8,0,8]
2592 ; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2593 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm9
2594 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm9
2595 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm0, %zmm8
2596 ; AVX512DQ-BW-FCP-NEXT: movb $-64, %bl
2597 ; AVX512DQ-BW-FCP-NEXT: kmovd %ebx, %k1
2598 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1}
2599 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm9
2600 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm10
2601 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm10, %ymm10
2602 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 128(%rdi), %ymm9, %ymm9
2603 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm10[0],ymm9[2],ymm10[2]
2604 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm8, %zmm16
2605 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [1,9,1,9,1,9,1,9]
2606 ; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2607 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm12
2608 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm11, %zmm12
2609 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm0, %zmm11
2610 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm11 {%k1}
2611 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm10[1],ymm9[3],ymm10[3]
2612 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9
2613 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,2,10,2,10,2,10]
2614 ; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2615 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm11
2616 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm10, %zmm11
2617 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm0, %zmm10
2618 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm10 {%k1}
2619 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm11
2620 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm12
2621 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm11[0],ymm12[2],ymm11[2]
2622 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm14
2623 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm15
2624 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm15[0],ymm14[0],ymm15[2],ymm14[2]
2625 ; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm8[2,3],ymm13[2,3]
2626 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm10
2627 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [3,11,3,11,3,11,3,11]
2628 ; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2629 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm13
2630 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm13
2631 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm0, %zmm8
2632 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm8 {%k1}
2633 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm12[1],ymm11[1],ymm12[3],ymm11[3]
2634 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm15[1],ymm14[1],ymm15[3],ymm14[3]
2635 ; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3]
2636 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm8, %zmm8
2637 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [4,12,4,12,4,12,4,12]
2638 ; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2639 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm12
2640 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm11, %zmm12
2641 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm13
2642 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm13
2643 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7]
2644 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm11
2645 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm13 = zmm0[0],zmm3[0],zmm0[2],zmm3[2],zmm0[4],zmm3[4],zmm0[6],zmm3[6]
2646 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm13 {%k1}
2647 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm13, %zmm11
2648 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,5,13,5,13,5,13]
2649 ; AVX512DQ-BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2650 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm13
2651 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm12, %zmm13
2652 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm14
2653 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm14
2654 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7]
2655 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm12
2656 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm0[1],zmm3[1],zmm0[3],zmm3[3],zmm0[5],zmm3[5],zmm0[7],zmm3[7]
2657 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1}
2658 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm12
2659 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14]
2660 ; AVX512DQ-BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2661 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm14
2662 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm13, %zmm14
2663 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm15
2664 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm13, %zmm15
2665 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7]
2666 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm0, %zmm13
2667 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm13 {%k1} = zmm5[0],zmm4[0],zmm5[2],zmm4[2],zmm5[4],zmm4[4],zmm5[6],zmm4[6]
2668 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm14, %zmm13, %zmm13
2669 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [7,15,7,15,7,15,7,15]
2670 ; AVX512DQ-BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2671 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm14, %zmm6
2672 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm14, %zmm1
2673 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7]
2674 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm14, %zmm0
2675 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm5[1],zmm4[1],zmm5[3],zmm4[3],zmm5[5],zmm4[5],zmm5[7],zmm4[7]
2676 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
2677 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, (%rsi)
2678 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, (%rdx)
2679 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, (%rcx)
2680 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, (%r8)
2681 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, (%r9)
2682 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, (%r11)
2683 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, (%r10)
2684 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%rax)
2685 ; AVX512DQ-BW-FCP-NEXT: popq %rbx
2686 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
2687 ; AVX512DQ-BW-FCP-NEXT: retq
2688 %wide.vec = load <64 x i64>, ptr %in.vec, align 64
2689 %strided.vec0 = shufflevector <64 x i64> %wide.vec, <64 x i64> poison, <8 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56>
2690 %strided.vec1 = shufflevector <64 x i64> %wide.vec, <64 x i64> poison, <8 x i32> <i32 1, i32 9, i32 17, i32 25, i32 33, i32 41, i32 49, i32 57>
2691 %strided.vec2 = shufflevector <64 x i64> %wide.vec, <64 x i64> poison, <8 x i32> <i32 2, i32 10, i32 18, i32 26, i32 34, i32 42, i32 50, i32 58>
2692 %strided.vec3 = shufflevector <64 x i64> %wide.vec, <64 x i64> poison, <8 x i32> <i32 3, i32 11, i32 19, i32 27, i32 35, i32 43, i32 51, i32 59>
2693 %strided.vec4 = shufflevector <64 x i64> %wide.vec, <64 x i64> poison, <8 x i32> <i32 4, i32 12, i32 20, i32 28, i32 36, i32 44, i32 52, i32 60>
2694 %strided.vec5 = shufflevector <64 x i64> %wide.vec, <64 x i64> poison, <8 x i32> <i32 5, i32 13, i32 21, i32 29, i32 37, i32 45, i32 53, i32 61>
2695 %strided.vec6 = shufflevector <64 x i64> %wide.vec, <64 x i64> poison, <8 x i32> <i32 6, i32 14, i32 22, i32 30, i32 38, i32 46, i32 54, i32 62>
2696 %strided.vec7 = shufflevector <64 x i64> %wide.vec, <64 x i64> poison, <8 x i32> <i32 7, i32 15, i32 23, i32 31, i32 39, i32 47, i32 55, i32 63>
2697 store <8 x i64> %strided.vec0, ptr %out.vec0, align 64
2698 store <8 x i64> %strided.vec1, ptr %out.vec1, align 64
2699 store <8 x i64> %strided.vec2, ptr %out.vec2, align 64
2700 store <8 x i64> %strided.vec3, ptr %out.vec3, align 64
2701 store <8 x i64> %strided.vec4, ptr %out.vec4, align 64
2702 store <8 x i64> %strided.vec5, ptr %out.vec5, align 64
2703 store <8 x i64> %strided.vec6, ptr %out.vec6, align 64
2704 store <8 x i64> %strided.vec7, ptr %out.vec7, align 64
2708 define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6, ptr %out.vec7) nounwind {
2709 ; SSE-LABEL: load_i64_stride8_vf16:
2711 ; SSE-NEXT: subq $664, %rsp # imm = 0x298
2712 ; SSE-NEXT: movaps 832(%rdi), %xmm0
2713 ; SSE-NEXT: movaps 320(%rdi), %xmm1
2714 ; SSE-NEXT: movaps 256(%rdi), %xmm8
2715 ; SSE-NEXT: movaps 960(%rdi), %xmm2
2716 ; SSE-NEXT: movaps 896(%rdi), %xmm9
2717 ; SSE-NEXT: movaps 448(%rdi), %xmm3
2718 ; SSE-NEXT: movaps 384(%rdi), %xmm10
2719 ; SSE-NEXT: movaps 576(%rdi), %xmm4
2720 ; SSE-NEXT: movaps 512(%rdi), %xmm11
2721 ; SSE-NEXT: movaps 64(%rdi), %xmm5
2722 ; SSE-NEXT: movaps (%rdi), %xmm12
2723 ; SSE-NEXT: movaps 704(%rdi), %xmm6
2724 ; SSE-NEXT: movaps 640(%rdi), %xmm13
2725 ; SSE-NEXT: movaps 192(%rdi), %xmm7
2726 ; SSE-NEXT: movaps 128(%rdi), %xmm14
2727 ; SSE-NEXT: movaps %xmm14, %xmm15
2728 ; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm7[0]
2729 ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2730 ; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm7[1]
2731 ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2732 ; SSE-NEXT: movaps %xmm13, %xmm7
2733 ; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm6[0]
2734 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2735 ; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm6[1]
2736 ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2737 ; SSE-NEXT: movaps %xmm12, %xmm6
2738 ; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm5[0]
2739 ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2740 ; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm5[1]
2741 ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2742 ; SSE-NEXT: movaps %xmm11, %xmm5
2743 ; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm4[0]
2744 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2745 ; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm4[1]
2746 ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2747 ; SSE-NEXT: movaps %xmm10, %xmm4
2748 ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm3[0]
2749 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2750 ; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm3[1]
2751 ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2752 ; SSE-NEXT: movaps %xmm9, %xmm3
2753 ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0]
2754 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2755 ; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm2[1]
2756 ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2757 ; SSE-NEXT: movaps %xmm8, %xmm2
2758 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
2759 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2760 ; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm1[1]
2761 ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2762 ; SSE-NEXT: movaps 768(%rdi), %xmm1
2763 ; SSE-NEXT: movaps %xmm1, %xmm2
2764 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
2765 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2766 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
2767 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2768 ; SSE-NEXT: movaps 80(%rdi), %xmm0
2769 ; SSE-NEXT: movaps 16(%rdi), %xmm1
2770 ; SSE-NEXT: movaps %xmm1, %xmm2
2771 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
2772 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2773 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
2774 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2775 ; SSE-NEXT: movaps 208(%rdi), %xmm0
2776 ; SSE-NEXT: movaps 144(%rdi), %xmm1
2777 ; SSE-NEXT: movaps %xmm1, %xmm2
2778 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
2779 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2780 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
2781 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2782 ; SSE-NEXT: movaps 336(%rdi), %xmm0
2783 ; SSE-NEXT: movaps 272(%rdi), %xmm1
2784 ; SSE-NEXT: movaps %xmm1, %xmm2
2785 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
2786 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2787 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
2788 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2789 ; SSE-NEXT: movaps 464(%rdi), %xmm0
2790 ; SSE-NEXT: movaps 400(%rdi), %xmm1
2791 ; SSE-NEXT: movaps %xmm1, %xmm2
2792 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
2793 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2794 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
2795 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2796 ; SSE-NEXT: movaps 592(%rdi), %xmm0
2797 ; SSE-NEXT: movaps 528(%rdi), %xmm1
2798 ; SSE-NEXT: movaps %xmm1, %xmm2
2799 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
2800 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2801 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
2802 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2803 ; SSE-NEXT: movaps 720(%rdi), %xmm0
2804 ; SSE-NEXT: movaps 656(%rdi), %xmm1
2805 ; SSE-NEXT: movaps %xmm1, %xmm2
2806 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
2807 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2808 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
2809 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2810 ; SSE-NEXT: movaps 848(%rdi), %xmm0
2811 ; SSE-NEXT: movaps 784(%rdi), %xmm1
2812 ; SSE-NEXT: movaps %xmm1, %xmm2
2813 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
2814 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2815 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
2816 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2817 ; SSE-NEXT: movaps 976(%rdi), %xmm0
2818 ; SSE-NEXT: movaps 912(%rdi), %xmm1
2819 ; SSE-NEXT: movaps %xmm1, %xmm2
2820 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
2821 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2822 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
2823 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2824 ; SSE-NEXT: movaps 96(%rdi), %xmm0
2825 ; SSE-NEXT: movaps 32(%rdi), %xmm1
2826 ; SSE-NEXT: movaps %xmm1, %xmm2
2827 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
2828 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2829 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
2830 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2831 ; SSE-NEXT: movaps 224(%rdi), %xmm0
2832 ; SSE-NEXT: movaps 160(%rdi), %xmm1
2833 ; SSE-NEXT: movaps %xmm1, %xmm2
2834 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
2835 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2836 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
2837 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2838 ; SSE-NEXT: movaps 352(%rdi), %xmm0
2839 ; SSE-NEXT: movaps 288(%rdi), %xmm1
2840 ; SSE-NEXT: movaps %xmm1, %xmm2
2841 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
2842 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2843 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
2844 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2845 ; SSE-NEXT: movaps 480(%rdi), %xmm0
2846 ; SSE-NEXT: movaps 416(%rdi), %xmm1
2847 ; SSE-NEXT: movaps %xmm1, %xmm2
2848 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
2849 ; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill
2850 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
2851 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2852 ; SSE-NEXT: movaps 608(%rdi), %xmm0
2853 ; SSE-NEXT: movaps 544(%rdi), %xmm1
2854 ; SSE-NEXT: movaps %xmm1, %xmm2
2855 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
2856 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2857 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
2858 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2859 ; SSE-NEXT: movaps 736(%rdi), %xmm0
2860 ; SSE-NEXT: movaps 672(%rdi), %xmm1
2861 ; SSE-NEXT: movaps %xmm1, %xmm2
2862 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
2863 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2864 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
2865 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2866 ; SSE-NEXT: movaps 864(%rdi), %xmm0
2867 ; SSE-NEXT: movaps 800(%rdi), %xmm12
2868 ; SSE-NEXT: movaps %xmm12, %xmm1
2869 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
2870 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2871 ; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1]
2872 ; SSE-NEXT: movaps 992(%rdi), %xmm0
2873 ; SSE-NEXT: movaps 928(%rdi), %xmm9
2874 ; SSE-NEXT: movaps %xmm9, %xmm1
2875 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
2876 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2877 ; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1]
2878 ; SSE-NEXT: movaps 112(%rdi), %xmm0
2879 ; SSE-NEXT: movaps 48(%rdi), %xmm1
2880 ; SSE-NEXT: movaps %xmm1, %xmm2
2881 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
2882 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2883 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
2884 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2885 ; SSE-NEXT: movaps 240(%rdi), %xmm0
2886 ; SSE-NEXT: movaps 176(%rdi), %xmm14
2887 ; SSE-NEXT: movaps %xmm14, %xmm1
2888 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
2889 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2890 ; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1]
2891 ; SSE-NEXT: movaps 368(%rdi), %xmm0
2892 ; SSE-NEXT: movaps 304(%rdi), %xmm13
2893 ; SSE-NEXT: movaps %xmm13, %xmm15
2894 ; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm0[0]
2895 ; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1]
2896 ; SSE-NEXT: movaps 496(%rdi), %xmm0
2897 ; SSE-NEXT: movaps 432(%rdi), %xmm10
2898 ; SSE-NEXT: movaps %xmm10, %xmm11
2899 ; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0]
2900 ; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1]
2901 ; SSE-NEXT: movaps 624(%rdi), %xmm0
2902 ; SSE-NEXT: movaps 560(%rdi), %xmm5
2903 ; SSE-NEXT: movaps %xmm5, %xmm8
2904 ; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0]
2905 ; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1]
2906 ; SSE-NEXT: movaps 752(%rdi), %xmm0
2907 ; SSE-NEXT: movaps 688(%rdi), %xmm6
2908 ; SSE-NEXT: movaps %xmm6, %xmm7
2909 ; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm0[0]
2910 ; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1]
2911 ; SSE-NEXT: movaps 880(%rdi), %xmm0
2912 ; SSE-NEXT: movaps 816(%rdi), %xmm1
2913 ; SSE-NEXT: movaps %xmm1, %xmm4
2914 ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0]
2915 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
2916 ; SSE-NEXT: movaps 1008(%rdi), %xmm0
2917 ; SSE-NEXT: movaps 944(%rdi), %xmm2
2918 ; SSE-NEXT: movaps %xmm2, %xmm3
2919 ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0]
2920 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
2921 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2922 ; SSE-NEXT: movaps %xmm0, 96(%rsi)
2923 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2924 ; SSE-NEXT: movaps %xmm0, 32(%rsi)
2925 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2926 ; SSE-NEXT: movaps %xmm0, 112(%rsi)
2927 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2928 ; SSE-NEXT: movaps %xmm0, 48(%rsi)
2929 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2930 ; SSE-NEXT: movaps %xmm0, 64(%rsi)
2931 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2932 ; SSE-NEXT: movaps %xmm0, (%rsi)
2933 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2934 ; SSE-NEXT: movaps %xmm0, 80(%rsi)
2935 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2936 ; SSE-NEXT: movaps %xmm0, 16(%rsi)
2937 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2938 ; SSE-NEXT: movaps %xmm0, 96(%rdx)
2939 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2940 ; SSE-NEXT: movaps %xmm0, 32(%rdx)
2941 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2942 ; SSE-NEXT: movaps %xmm0, 112(%rdx)
2943 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2944 ; SSE-NEXT: movaps %xmm0, 48(%rdx)
2945 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2946 ; SSE-NEXT: movaps %xmm0, 64(%rdx)
2947 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2948 ; SSE-NEXT: movaps %xmm0, (%rdx)
2949 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2950 ; SSE-NEXT: movaps %xmm0, 80(%rdx)
2951 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2952 ; SSE-NEXT: movaps %xmm0, 16(%rdx)
2953 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2954 ; SSE-NEXT: movaps %xmm0, 96(%rcx)
2955 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2956 ; SSE-NEXT: movaps %xmm0, 32(%rcx)
2957 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2958 ; SSE-NEXT: movaps %xmm0, 112(%rcx)
2959 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2960 ; SSE-NEXT: movaps %xmm0, 48(%rcx)
2961 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2962 ; SSE-NEXT: movaps %xmm0, 64(%rcx)
2963 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2964 ; SSE-NEXT: movaps %xmm0, (%rcx)
2965 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2966 ; SSE-NEXT: movaps %xmm0, 80(%rcx)
2967 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2968 ; SSE-NEXT: movaps %xmm0, 16(%rcx)
2969 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2970 ; SSE-NEXT: movaps %xmm0, 112(%r8)
2971 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2972 ; SSE-NEXT: movaps %xmm0, 96(%r8)
2973 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2974 ; SSE-NEXT: movaps %xmm0, 80(%r8)
2975 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2976 ; SSE-NEXT: movaps %xmm0, 64(%r8)
2977 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2978 ; SSE-NEXT: movaps %xmm0, 48(%r8)
2979 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2980 ; SSE-NEXT: movaps %xmm0, 32(%r8)
2981 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2982 ; SSE-NEXT: movaps %xmm0, 16(%r8)
2983 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2984 ; SSE-NEXT: movaps %xmm0, (%r8)
2985 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2986 ; SSE-NEXT: movaps %xmm0, 112(%r9)
2987 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2988 ; SSE-NEXT: movaps %xmm0, 96(%r9)
2989 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2990 ; SSE-NEXT: movaps %xmm0, 80(%r9)
2991 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2992 ; SSE-NEXT: movaps %xmm0, 64(%r9)
2993 ; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
2994 ; SSE-NEXT: movaps %xmm0, 48(%r9)
2995 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2996 ; SSE-NEXT: movaps %xmm0, 32(%r9)
2997 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2998 ; SSE-NEXT: movaps %xmm0, 16(%r9)
2999 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3000 ; SSE-NEXT: movaps %xmm0, (%r9)
3001 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
3002 ; SSE-NEXT: movaps %xmm9, 112(%rax)
3003 ; SSE-NEXT: movaps %xmm12, 96(%rax)
3004 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3005 ; SSE-NEXT: movaps %xmm0, 80(%rax)
3006 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3007 ; SSE-NEXT: movaps %xmm0, 64(%rax)
3008 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3009 ; SSE-NEXT: movaps %xmm0, 48(%rax)
3010 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3011 ; SSE-NEXT: movaps %xmm0, 32(%rax)
3012 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3013 ; SSE-NEXT: movaps %xmm0, 16(%rax)
3014 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3015 ; SSE-NEXT: movaps %xmm0, (%rax)
3016 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
3017 ; SSE-NEXT: movaps %xmm3, 112(%rax)
3018 ; SSE-NEXT: movaps %xmm4, 96(%rax)
3019 ; SSE-NEXT: movaps %xmm7, 80(%rax)
3020 ; SSE-NEXT: movaps %xmm8, 64(%rax)
3021 ; SSE-NEXT: movaps %xmm11, 48(%rax)
3022 ; SSE-NEXT: movaps %xmm15, 32(%rax)
3023 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3024 ; SSE-NEXT: movaps %xmm0, 16(%rax)
3025 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3026 ; SSE-NEXT: movaps %xmm0, (%rax)
3027 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
3028 ; SSE-NEXT: movaps %xmm2, 112(%rax)
3029 ; SSE-NEXT: movaps %xmm1, 96(%rax)
3030 ; SSE-NEXT: movaps %xmm6, 80(%rax)
3031 ; SSE-NEXT: movaps %xmm5, 64(%rax)
3032 ; SSE-NEXT: movaps %xmm10, 48(%rax)
3033 ; SSE-NEXT: movaps %xmm13, 32(%rax)
3034 ; SSE-NEXT: movaps %xmm14, 16(%rax)
3035 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3036 ; SSE-NEXT: movaps %xmm0, (%rax)
3037 ; SSE-NEXT: addq $664, %rsp # imm = 0x298
3040 ; AVX-LABEL: load_i64_stride8_vf16:
3042 ; AVX-NEXT: subq $808, %rsp # imm = 0x328
3043 ; AVX-NEXT: vmovaps 448(%rdi), %xmm0
3044 ; AVX-NEXT: vmovaps 384(%rdi), %xmm1
3045 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
3046 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3047 ; AVX-NEXT: vmovaps 320(%rdi), %xmm2
3048 ; AVX-NEXT: vmovaps 256(%rdi), %xmm3
3049 ; AVX-NEXT: vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm2[0]
3050 ; AVX-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3051 ; AVX-NEXT: vmovaps 832(%rdi), %xmm4
3052 ; AVX-NEXT: vmovaps 768(%rdi), %xmm5
3053 ; AVX-NEXT: vmovlhps {{.*#+}} xmm6 = xmm5[0],xmm4[0]
3054 ; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3055 ; AVX-NEXT: vmovaps 960(%rdi), %xmm6
3056 ; AVX-NEXT: vmovaps 896(%rdi), %xmm7
3057 ; AVX-NEXT: vmovlhps {{.*#+}} xmm8 = xmm7[0],xmm6[0]
3058 ; AVX-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3059 ; AVX-NEXT: vmovaps 704(%rdi), %xmm8
3060 ; AVX-NEXT: vmovaps 640(%rdi), %xmm9
3061 ; AVX-NEXT: vmovlhps {{.*#+}} xmm10 = xmm9[0],xmm8[0]
3062 ; AVX-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3063 ; AVX-NEXT: vmovaps 576(%rdi), %xmm10
3064 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
3065 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3066 ; AVX-NEXT: vmovaps 512(%rdi), %xmm0
3067 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm3[1],xmm2[1]
3068 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3069 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm10[0]
3070 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3071 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm7[1],xmm6[1]
3072 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3073 ; AVX-NEXT: vmovaps 64(%rdi), %xmm1
3074 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm5[1],xmm4[1]
3075 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3076 ; AVX-NEXT: vmovaps 192(%rdi), %xmm2
3077 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm9[1],xmm8[1]
3078 ; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3079 ; AVX-NEXT: vmovaps 128(%rdi), %xmm3
3080 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm10[1]
3081 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3082 ; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm3[0],xmm2[0]
3083 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3084 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm2[1]
3085 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3086 ; AVX-NEXT: vmovaps (%rdi), %xmm0
3087 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0]
3088 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3089 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
3090 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3091 ; AVX-NEXT: vmovaps 448(%rdi), %ymm0
3092 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3093 ; AVX-NEXT: vmovaps 384(%rdi), %ymm1
3094 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3095 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
3096 ; AVX-NEXT: vmovaps 336(%rdi), %xmm3
3097 ; AVX-NEXT: vmovaps 272(%rdi), %xmm4
3098 ; AVX-NEXT: vmovlhps {{.*#+}} xmm5 = xmm4[0],xmm3[0]
3099 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm2[4,5,6,7]
3100 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3101 ; AVX-NEXT: vmovaps 960(%rdi), %ymm2
3102 ; AVX-NEXT: vmovaps 896(%rdi), %ymm5
3103 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm5[0],ymm2[0],ymm5[2],ymm2[2]
3104 ; AVX-NEXT: vmovaps 848(%rdi), %xmm7
3105 ; AVX-NEXT: vmovaps 784(%rdi), %xmm8
3106 ; AVX-NEXT: vmovlhps {{.*#+}} xmm9 = xmm8[0],xmm7[0]
3107 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm6[4,5,6,7]
3108 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3109 ; AVX-NEXT: vmovaps 192(%rdi), %ymm6
3110 ; AVX-NEXT: vmovaps 128(%rdi), %ymm9
3111 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm6[0],ymm9[2],ymm6[2]
3112 ; AVX-NEXT: vmovaps 80(%rdi), %xmm11
3113 ; AVX-NEXT: vmovaps 16(%rdi), %xmm12
3114 ; AVX-NEXT: vmovlhps {{.*#+}} xmm13 = xmm12[0],xmm11[0]
3115 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm10[4,5,6,7]
3116 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3117 ; AVX-NEXT: vmovaps 704(%rdi), %ymm10
3118 ; AVX-NEXT: vmovaps 640(%rdi), %ymm13
3119 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm13[0],ymm10[0],ymm13[2],ymm10[2]
3120 ; AVX-NEXT: vmovaps 592(%rdi), %xmm15
3121 ; AVX-NEXT: vmovaps 528(%rdi), %xmm0
3122 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm15[0]
3123 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7]
3124 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3125 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
3126 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
3127 ; AVX-NEXT: # ymm1 = ymm1[1],mem[1],ymm1[3],mem[3]
3128 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm4[1],xmm3[1]
3129 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
3130 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3131 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm5[1],ymm2[1],ymm5[3],ymm2[3]
3132 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm8[1],xmm7[1]
3133 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
3134 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3135 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm9[1],ymm6[1],ymm9[3],ymm6[3]
3136 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm12[1],xmm11[1]
3137 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
3138 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3139 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm13[1],ymm10[1],ymm13[3],ymm10[3]
3140 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm15[1]
3141 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3142 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3143 ; AVX-NEXT: vmovaps 480(%rdi), %xmm0
3144 ; AVX-NEXT: vmovaps 416(%rdi), %xmm1
3145 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
3146 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3147 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
3148 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3149 ; AVX-NEXT: vmovaps 352(%rdi), %xmm0
3150 ; AVX-NEXT: vmovaps 288(%rdi), %xmm1
3151 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
3152 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3153 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
3154 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3155 ; AVX-NEXT: vmovaps 992(%rdi), %xmm0
3156 ; AVX-NEXT: vmovaps 928(%rdi), %xmm1
3157 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
3158 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3159 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
3160 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3161 ; AVX-NEXT: vmovaps 864(%rdi), %xmm0
3162 ; AVX-NEXT: vmovaps 800(%rdi), %xmm1
3163 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
3164 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3165 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
3166 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3167 ; AVX-NEXT: vmovaps 224(%rdi), %xmm0
3168 ; AVX-NEXT: vmovaps 160(%rdi), %xmm1
3169 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
3170 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3171 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
3172 ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
3173 ; AVX-NEXT: vmovaps 96(%rdi), %xmm0
3174 ; AVX-NEXT: vmovaps 32(%rdi), %xmm1
3175 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
3176 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3177 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
3178 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3179 ; AVX-NEXT: vmovaps 736(%rdi), %xmm0
3180 ; AVX-NEXT: vmovaps 672(%rdi), %xmm1
3181 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
3182 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3183 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
3184 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3185 ; AVX-NEXT: vmovaps 608(%rdi), %xmm0
3186 ; AVX-NEXT: vmovaps 544(%rdi), %xmm1
3187 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
3188 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3189 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
3190 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3191 ; AVX-NEXT: vmovaps 480(%rdi), %ymm0
3192 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3193 ; AVX-NEXT: vmovaps 416(%rdi), %ymm14
3194 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm14[0],ymm0[0],ymm14[2],ymm0[2]
3195 ; AVX-NEXT: vmovaps 368(%rdi), %xmm12
3196 ; AVX-NEXT: vmovaps 304(%rdi), %xmm6
3197 ; AVX-NEXT: vmovlhps {{.*#+}} xmm3 = xmm6[0],xmm12[0]
3198 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm1[4,5,6,7]
3199 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3200 ; AVX-NEXT: vmovaps 992(%rdi), %ymm0
3201 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3202 ; AVX-NEXT: vmovaps 928(%rdi), %ymm15
3203 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm15[0],ymm0[0],ymm15[2],ymm0[2]
3204 ; AVX-NEXT: vmovaps 880(%rdi), %xmm11
3205 ; AVX-NEXT: vmovaps 816(%rdi), %xmm10
3206 ; AVX-NEXT: vmovlhps {{.*#+}} xmm9 = xmm10[0],xmm11[0]
3207 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm8[4,5,6,7]
3208 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3209 ; AVX-NEXT: vmovaps 224(%rdi), %ymm9
3210 ; AVX-NEXT: vmovaps 160(%rdi), %ymm4
3211 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm4[0],ymm9[0],ymm4[2],ymm9[2]
3212 ; AVX-NEXT: vmovaps 112(%rdi), %xmm3
3213 ; AVX-NEXT: vmovaps 48(%rdi), %xmm0
3214 ; AVX-NEXT: vmovlhps {{.*#+}} xmm13 = xmm0[0],xmm3[0]
3215 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm8[4,5,6,7]
3216 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3217 ; AVX-NEXT: vmovaps 736(%rdi), %ymm7
3218 ; AVX-NEXT: vmovaps 672(%rdi), %ymm5
3219 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm5[0],ymm7[0],ymm5[2],ymm7[2]
3220 ; AVX-NEXT: vmovaps 624(%rdi), %xmm2
3221 ; AVX-NEXT: vmovaps 560(%rdi), %xmm1
3222 ; AVX-NEXT: vmovlhps {{.*#+}} xmm13 = xmm1[0],xmm2[0]
3223 ; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
3224 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm9[1],ymm4[3],ymm9[3]
3225 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1]
3226 ; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm4[4,5,6,7]
3227 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload
3228 ; AVX-NEXT: # ymm0 = ymm14[1],mem[1],ymm14[3],mem[3]
3229 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm6[1],xmm12[1]
3230 ; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm3[0,1,2,3],ymm0[4,5,6,7]
3231 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm7[1],ymm5[3],ymm7[3]
3232 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
3233 ; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5,6,7]
3234 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload
3235 ; AVX-NEXT: # ymm0 = ymm15[1],mem[1],ymm15[3],mem[3]
3236 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm10[1],xmm11[1]
3237 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
3238 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3239 ; AVX-NEXT: vmovaps %xmm1, 16(%rsi)
3240 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3241 ; AVX-NEXT: vmovaps %xmm1, (%rsi)
3242 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3243 ; AVX-NEXT: vmovaps %xmm1, 64(%rsi)
3244 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3245 ; AVX-NEXT: vmovaps %xmm1, 80(%rsi)
3246 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3247 ; AVX-NEXT: vmovaps %xmm1, 112(%rsi)
3248 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3249 ; AVX-NEXT: vmovaps %xmm1, 96(%rsi)
3250 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3251 ; AVX-NEXT: vmovaps %xmm1, 32(%rsi)
3252 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3253 ; AVX-NEXT: vmovaps %xmm1, 48(%rsi)
3254 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3255 ; AVX-NEXT: vmovaps %xmm1, (%rdx)
3256 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3257 ; AVX-NEXT: vmovaps %xmm1, 16(%rdx)
3258 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3259 ; AVX-NEXT: vmovaps %xmm1, 64(%rdx)
3260 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3261 ; AVX-NEXT: vmovaps %xmm1, 80(%rdx)
3262 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3263 ; AVX-NEXT: vmovaps %xmm1, 96(%rdx)
3264 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3265 ; AVX-NEXT: vmovaps %xmm1, 112(%rdx)
3266 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3267 ; AVX-NEXT: vmovaps %xmm1, 32(%rdx)
3268 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3269 ; AVX-NEXT: vmovaps %xmm1, 48(%rdx)
3270 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
3271 ; AVX-NEXT: vmovaps %ymm1, 64(%rcx)
3272 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
3273 ; AVX-NEXT: vmovaps %ymm1, (%rcx)
3274 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
3275 ; AVX-NEXT: vmovaps %ymm1, 96(%rcx)
3276 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
3277 ; AVX-NEXT: vmovaps %ymm1, 32(%rcx)
3278 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
3279 ; AVX-NEXT: vmovaps %ymm1, 64(%r8)
3280 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
3281 ; AVX-NEXT: vmovaps %ymm1, (%r8)
3282 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
3283 ; AVX-NEXT: vmovaps %ymm1, 96(%r8)
3284 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
3285 ; AVX-NEXT: vmovaps %ymm1, 32(%r8)
3286 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3287 ; AVX-NEXT: vmovaps %xmm1, 80(%r9)
3288 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3289 ; AVX-NEXT: vmovaps %xmm1, 64(%r9)
3290 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3291 ; AVX-NEXT: vmovaps %xmm1, (%r9)
3292 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3293 ; AVX-NEXT: vmovaps %xmm1, 16(%r9)
3294 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3295 ; AVX-NEXT: vmovaps %xmm1, 112(%r9)
3296 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3297 ; AVX-NEXT: vmovaps %xmm1, 96(%r9)
3298 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3299 ; AVX-NEXT: vmovaps %xmm1, 32(%r9)
3300 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3301 ; AVX-NEXT: vmovaps %xmm1, 48(%r9)
3302 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
3303 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3304 ; AVX-NEXT: vmovaps %xmm1, 64(%rax)
3305 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3306 ; AVX-NEXT: vmovaps %xmm1, 80(%rax)
3307 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3308 ; AVX-NEXT: vmovaps %xmm1, (%rax)
3309 ; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
3310 ; AVX-NEXT: vmovaps %xmm1, 16(%rax)
3311 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3312 ; AVX-NEXT: vmovaps %xmm1, 96(%rax)
3313 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3314 ; AVX-NEXT: vmovaps %xmm1, 112(%rax)
3315 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3316 ; AVX-NEXT: vmovaps %xmm1, 32(%rax)
3317 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3318 ; AVX-NEXT: vmovaps %xmm1, 48(%rax)
3319 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
3320 ; AVX-NEXT: vmovaps %ymm8, 64(%rax)
3321 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
3322 ; AVX-NEXT: vmovaps %ymm1, (%rax)
3323 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
3324 ; AVX-NEXT: vmovaps %ymm1, 96(%rax)
3325 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
3326 ; AVX-NEXT: vmovaps %ymm1, 32(%rax)
3327 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
3328 ; AVX-NEXT: vmovaps %ymm0, 96(%rax)
3329 ; AVX-NEXT: vmovaps %ymm4, 64(%rax)
3330 ; AVX-NEXT: vmovaps %ymm6, 32(%rax)
3331 ; AVX-NEXT: vmovaps %ymm9, (%rax)
3332 ; AVX-NEXT: addq $808, %rsp # imm = 0x328
3333 ; AVX-NEXT: vzeroupper
3336 ; AVX2-LABEL: load_i64_stride8_vf16:
3338 ; AVX2-NEXT: subq $808, %rsp # imm = 0x328
3339 ; AVX2-NEXT: vmovaps 832(%rdi), %ymm1
3340 ; AVX2-NEXT: vmovaps 768(%rdi), %ymm2
3341 ; AVX2-NEXT: vmovaps 320(%rdi), %ymm3
3342 ; AVX2-NEXT: vmovaps 256(%rdi), %ymm4
3343 ; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3344 ; AVX2-NEXT: vmovaps 448(%rdi), %ymm5
3345 ; AVX2-NEXT: vmovaps 384(%rdi), %ymm13
3346 ; AVX2-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3347 ; AVX2-NEXT: vmovaps 320(%rdi), %xmm6
3348 ; AVX2-NEXT: vinsertf128 $1, 448(%rdi), %ymm6, %ymm6
3349 ; AVX2-NEXT: vmovaps 256(%rdi), %xmm7
3350 ; AVX2-NEXT: vinsertf128 $1, 384(%rdi), %ymm7, %ymm7
3351 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm6[0],ymm7[2],ymm6[2]
3352 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3353 ; AVX2-NEXT: vmovaps 832(%rdi), %xmm8
3354 ; AVX2-NEXT: vinsertf128 $1, 960(%rdi), %ymm8, %ymm8
3355 ; AVX2-NEXT: vmovaps 768(%rdi), %xmm9
3356 ; AVX2-NEXT: vinsertf128 $1, 896(%rdi), %ymm9, %ymm9
3357 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm9[0],ymm8[0],ymm9[2],ymm8[2]
3358 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3359 ; AVX2-NEXT: vmovaps (%rdi), %xmm10
3360 ; AVX2-NEXT: vmovaps 64(%rdi), %xmm11
3361 ; AVX2-NEXT: vinsertf128 $1, 192(%rdi), %ymm11, %ymm11
3362 ; AVX2-NEXT: vinsertf128 $1, 128(%rdi), %ymm10, %ymm10
3363 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm11[0],ymm10[2],ymm11[2]
3364 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3365 ; AVX2-NEXT: vmovaps 576(%rdi), %xmm12
3366 ; AVX2-NEXT: vinsertf128 $1, 704(%rdi), %ymm12, %ymm12
3367 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm6[1],ymm7[3],ymm6[3]
3368 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3369 ; AVX2-NEXT: vmovaps 512(%rdi), %xmm6
3370 ; AVX2-NEXT: vinsertf128 $1, 640(%rdi), %ymm6, %ymm6
3371 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm9[1],ymm8[1],ymm9[3],ymm8[3]
3372 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3373 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm6[0],ymm12[0],ymm6[2],ymm12[2]
3374 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3375 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm11[1],ymm10[3],ymm11[3]
3376 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3377 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm12[1],ymm6[3],ymm12[3]
3378 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3379 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm13[0],ymm5[0],ymm13[2],ymm5[2]
3380 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm4[0],ymm3[0],ymm4[2],ymm3[2]
3381 ; AVX2-NEXT: vmovaps %ymm3, %ymm4
3382 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],ymm6[2,3]
3383 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3384 ; AVX2-NEXT: vmovaps 960(%rdi), %ymm6
3385 ; AVX2-NEXT: vmovaps 896(%rdi), %ymm7
3386 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm7[0],ymm6[0],ymm7[2],ymm6[2]
3387 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
3388 ; AVX2-NEXT: vmovaps %ymm2, %ymm3
3389 ; AVX2-NEXT: vmovaps %ymm1, %ymm2
3390 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm9[2,3],ymm8[2,3]
3391 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3392 ; AVX2-NEXT: vmovaps 64(%rdi), %ymm8
3393 ; AVX2-NEXT: vmovaps (%rdi), %ymm9
3394 ; AVX2-NEXT: vmovaps 192(%rdi), %ymm10
3395 ; AVX2-NEXT: vmovaps 128(%rdi), %ymm11
3396 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm11[0],ymm10[0],ymm11[2],ymm10[2]
3397 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm9[0],ymm8[0],ymm9[2],ymm8[2]
3398 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm12[2,3]
3399 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3400 ; AVX2-NEXT: vmovaps 576(%rdi), %ymm12
3401 ; AVX2-NEXT: vmovaps 512(%rdi), %ymm13
3402 ; AVX2-NEXT: vmovaps 704(%rdi), %ymm14
3403 ; AVX2-NEXT: vmovaps 640(%rdi), %ymm15
3404 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm14[0],ymm15[2],ymm14[2]
3405 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm13[0],ymm12[0],ymm13[2],ymm12[2]
3406 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
3407 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3408 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3409 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm5[1],ymm0[3],ymm5[3]
3410 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
3411 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm4[1],ymm1[3],ymm4[3]
3412 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
3413 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3414 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm6[1],ymm7[3],ymm6[3]
3415 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm2[1],ymm3[3],ymm2[3]
3416 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
3417 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3418 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm10[1],ymm11[3],ymm10[3]
3419 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm9[1],ymm8[1],ymm9[3],ymm8[3]
3420 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
3421 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3422 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm15[1],ymm14[1],ymm15[3],ymm14[3]
3423 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm13[1],ymm12[1],ymm13[3],ymm12[3]
3424 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
3425 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3426 ; AVX2-NEXT: vmovaps 352(%rdi), %xmm0
3427 ; AVX2-NEXT: vinsertf128 $1, 480(%rdi), %ymm0, %ymm0
3428 ; AVX2-NEXT: vmovaps 288(%rdi), %xmm1
3429 ; AVX2-NEXT: vinsertf128 $1, 416(%rdi), %ymm1, %ymm1
3430 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
3431 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3432 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
3433 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3434 ; AVX2-NEXT: vmovaps 864(%rdi), %xmm0
3435 ; AVX2-NEXT: vinsertf128 $1, 992(%rdi), %ymm0, %ymm0
3436 ; AVX2-NEXT: vmovaps 800(%rdi), %xmm1
3437 ; AVX2-NEXT: vinsertf128 $1, 928(%rdi), %ymm1, %ymm1
3438 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
3439 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3440 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
3441 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3442 ; AVX2-NEXT: vmovaps 96(%rdi), %xmm0
3443 ; AVX2-NEXT: vinsertf128 $1, 224(%rdi), %ymm0, %ymm0
3444 ; AVX2-NEXT: vmovaps 32(%rdi), %xmm1
3445 ; AVX2-NEXT: vinsertf128 $1, 160(%rdi), %ymm1, %ymm1
3446 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
3447 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3448 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
3449 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3450 ; AVX2-NEXT: vmovaps 608(%rdi), %xmm0
3451 ; AVX2-NEXT: vinsertf128 $1, 736(%rdi), %ymm0, %ymm0
3452 ; AVX2-NEXT: vmovaps 544(%rdi), %xmm1
3453 ; AVX2-NEXT: vinsertf128 $1, 672(%rdi), %ymm1, %ymm1
3454 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
3455 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3456 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
3457 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3458 ; AVX2-NEXT: vmovaps 352(%rdi), %ymm0
3459 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3460 ; AVX2-NEXT: vmovaps 288(%rdi), %ymm15
3461 ; AVX2-NEXT: vmovaps 480(%rdi), %ymm12
3462 ; AVX2-NEXT: vmovaps 416(%rdi), %ymm10
3463 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm10[0],ymm12[0],ymm10[2],ymm12[2]
3464 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm15[0],ymm0[0],ymm15[2],ymm0[2]
3465 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm5[2,3],ymm4[2,3]
3466 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3467 ; AVX2-NEXT: vmovaps 864(%rdi), %ymm0
3468 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3469 ; AVX2-NEXT: vmovaps 800(%rdi), %ymm14
3470 ; AVX2-NEXT: vmovaps 992(%rdi), %ymm11
3471 ; AVX2-NEXT: vmovaps 928(%rdi), %ymm7
3472 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm7[0],ymm11[0],ymm7[2],ymm11[2]
3473 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm14[0],ymm0[0],ymm14[2],ymm0[2]
3474 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm9[2,3],ymm8[2,3]
3475 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3476 ; AVX2-NEXT: vmovaps 96(%rdi), %ymm9
3477 ; AVX2-NEXT: vmovaps 32(%rdi), %ymm6
3478 ; AVX2-NEXT: vmovaps 224(%rdi), %ymm3
3479 ; AVX2-NEXT: vmovaps 160(%rdi), %ymm2
3480 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
3481 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm6[0],ymm9[0],ymm6[2],ymm9[2]
3482 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm8[2,3]
3483 ; AVX2-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
3484 ; AVX2-NEXT: vmovaps 608(%rdi), %ymm5
3485 ; AVX2-NEXT: vmovaps 544(%rdi), %ymm4
3486 ; AVX2-NEXT: vmovaps 736(%rdi), %ymm1
3487 ; AVX2-NEXT: vmovaps 672(%rdi), %ymm0
3488 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
3489 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
3490 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm13[2,3],ymm8[2,3]
3491 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
3492 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm6[1],ymm9[1],ymm6[3],ymm9[3]
3493 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm3[2,3],ymm2[2,3]
3494 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm10[1],ymm12[1],ymm10[3],ymm12[3]
3495 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload
3496 ; AVX2-NEXT: # ymm3 = ymm15[1],mem[1],ymm15[3],mem[3]
3497 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm3[2,3],ymm2[2,3]
3498 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
3499 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
3500 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
3501 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm7[1],ymm11[1],ymm7[3],ymm11[3]
3502 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload
3503 ; AVX2-NEXT: # ymm3 = ymm14[1],mem[1],ymm14[3],mem[3]
3504 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
3505 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
3506 ; AVX2-NEXT: vmovaps %ymm3, 64(%rsi)
3507 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
3508 ; AVX2-NEXT: vmovaps %ymm3, (%rsi)
3509 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
3510 ; AVX2-NEXT: vmovaps %ymm3, 96(%rsi)
3511 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
3512 ; AVX2-NEXT: vmovaps %ymm3, 32(%rsi)
3513 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
3514 ; AVX2-NEXT: vmovaps %ymm3, 64(%rdx)
3515 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
3516 ; AVX2-NEXT: vmovaps %ymm3, (%rdx)
3517 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
3518 ; AVX2-NEXT: vmovaps %ymm3, 96(%rdx)
3519 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
3520 ; AVX2-NEXT: vmovaps %ymm3, 32(%rdx)
3521 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
3522 ; AVX2-NEXT: vmovaps %ymm3, 64(%rcx)
3523 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
3524 ; AVX2-NEXT: vmovaps %ymm3, (%rcx)
3525 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
3526 ; AVX2-NEXT: vmovaps %ymm3, 96(%rcx)
3527 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
3528 ; AVX2-NEXT: vmovaps %ymm3, 32(%rcx)
3529 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
3530 ; AVX2-NEXT: vmovaps %ymm3, 64(%r8)
3531 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
3532 ; AVX2-NEXT: vmovaps %ymm3, (%r8)
3533 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
3534 ; AVX2-NEXT: vmovaps %ymm3, 96(%r8)
3535 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
3536 ; AVX2-NEXT: vmovaps %ymm3, 32(%r8)
3537 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
3538 ; AVX2-NEXT: vmovaps %ymm3, 64(%r9)
3539 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
3540 ; AVX2-NEXT: vmovaps %ymm3, (%r9)
3541 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
3542 ; AVX2-NEXT: vmovaps %ymm3, 96(%r9)
3543 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
3544 ; AVX2-NEXT: vmovaps %ymm3, 32(%r9)
3545 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
3546 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
3547 ; AVX2-NEXT: vmovaps %ymm3, 64(%rax)
3548 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
3549 ; AVX2-NEXT: vmovaps %ymm3, (%rax)
3550 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
3551 ; AVX2-NEXT: vmovaps %ymm3, 96(%rax)
3552 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
3553 ; AVX2-NEXT: vmovaps %ymm3, 32(%rax)
3554 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
3555 ; AVX2-NEXT: vmovaps %ymm8, 64(%rax)
3556 ; AVX2-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload
3557 ; AVX2-NEXT: vmovaps %ymm3, (%rax)
3558 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
3559 ; AVX2-NEXT: vmovaps %ymm3, 96(%rax)
3560 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
3561 ; AVX2-NEXT: vmovaps %ymm3, 32(%rax)
3562 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
3563 ; AVX2-NEXT: vmovaps %ymm1, 96(%rax)
3564 ; AVX2-NEXT: vmovaps %ymm0, 64(%rax)
3565 ; AVX2-NEXT: vmovaps %ymm2, 32(%rax)
3566 ; AVX2-NEXT: vmovaps %ymm9, (%rax)
3567 ; AVX2-NEXT: addq $808, %rsp # imm = 0x328
3568 ; AVX2-NEXT: vzeroupper
3571 ; AVX2-FP-LABEL: load_i64_stride8_vf16:
3573 ; AVX2-FP-NEXT: subq $808, %rsp # imm = 0x328
3574 ; AVX2-FP-NEXT: vmovaps 832(%rdi), %ymm1
3575 ; AVX2-FP-NEXT: vmovaps 768(%rdi), %ymm2
3576 ; AVX2-FP-NEXT: vmovaps 320(%rdi), %ymm3
3577 ; AVX2-FP-NEXT: vmovaps 256(%rdi), %ymm4
3578 ; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3579 ; AVX2-FP-NEXT: vmovaps 448(%rdi), %ymm5
3580 ; AVX2-FP-NEXT: vmovaps 384(%rdi), %ymm13
3581 ; AVX2-FP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3582 ; AVX2-FP-NEXT: vmovaps 320(%rdi), %xmm6
3583 ; AVX2-FP-NEXT: vinsertf128 $1, 448(%rdi), %ymm6, %ymm6
3584 ; AVX2-FP-NEXT: vmovaps 256(%rdi), %xmm7
3585 ; AVX2-FP-NEXT: vinsertf128 $1, 384(%rdi), %ymm7, %ymm7
3586 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm6[0],ymm7[2],ymm6[2]
3587 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3588 ; AVX2-FP-NEXT: vmovaps 832(%rdi), %xmm8
3589 ; AVX2-FP-NEXT: vinsertf128 $1, 960(%rdi), %ymm8, %ymm8
3590 ; AVX2-FP-NEXT: vmovaps 768(%rdi), %xmm9
3591 ; AVX2-FP-NEXT: vinsertf128 $1, 896(%rdi), %ymm9, %ymm9
3592 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm9[0],ymm8[0],ymm9[2],ymm8[2]
3593 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3594 ; AVX2-FP-NEXT: vmovaps (%rdi), %xmm10
3595 ; AVX2-FP-NEXT: vmovaps 64(%rdi), %xmm11
3596 ; AVX2-FP-NEXT: vinsertf128 $1, 192(%rdi), %ymm11, %ymm11
3597 ; AVX2-FP-NEXT: vinsertf128 $1, 128(%rdi), %ymm10, %ymm10
3598 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm11[0],ymm10[2],ymm11[2]
3599 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3600 ; AVX2-FP-NEXT: vmovaps 576(%rdi), %xmm12
3601 ; AVX2-FP-NEXT: vinsertf128 $1, 704(%rdi), %ymm12, %ymm12
3602 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm6[1],ymm7[3],ymm6[3]
3603 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3604 ; AVX2-FP-NEXT: vmovaps 512(%rdi), %xmm6
3605 ; AVX2-FP-NEXT: vinsertf128 $1, 640(%rdi), %ymm6, %ymm6
3606 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm9[1],ymm8[1],ymm9[3],ymm8[3]
3607 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3608 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm6[0],ymm12[0],ymm6[2],ymm12[2]
3609 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3610 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm11[1],ymm10[3],ymm11[3]
3611 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3612 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm12[1],ymm6[3],ymm12[3]
3613 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3614 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm13[0],ymm5[0],ymm13[2],ymm5[2]
3615 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm4[0],ymm3[0],ymm4[2],ymm3[2]
3616 ; AVX2-FP-NEXT: vmovaps %ymm3, %ymm4
3617 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],ymm6[2,3]
3618 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3619 ; AVX2-FP-NEXT: vmovaps 960(%rdi), %ymm6
3620 ; AVX2-FP-NEXT: vmovaps 896(%rdi), %ymm7
3621 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm7[0],ymm6[0],ymm7[2],ymm6[2]
3622 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
3623 ; AVX2-FP-NEXT: vmovaps %ymm2, %ymm3
3624 ; AVX2-FP-NEXT: vmovaps %ymm1, %ymm2
3625 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm9[2,3],ymm8[2,3]
3626 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3627 ; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm8
3628 ; AVX2-FP-NEXT: vmovaps (%rdi), %ymm9
3629 ; AVX2-FP-NEXT: vmovaps 192(%rdi), %ymm10
3630 ; AVX2-FP-NEXT: vmovaps 128(%rdi), %ymm11
3631 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm11[0],ymm10[0],ymm11[2],ymm10[2]
3632 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm9[0],ymm8[0],ymm9[2],ymm8[2]
3633 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm12[2,3]
3634 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3635 ; AVX2-FP-NEXT: vmovaps 576(%rdi), %ymm12
3636 ; AVX2-FP-NEXT: vmovaps 512(%rdi), %ymm13
3637 ; AVX2-FP-NEXT: vmovaps 704(%rdi), %ymm14
3638 ; AVX2-FP-NEXT: vmovaps 640(%rdi), %ymm15
3639 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm14[0],ymm15[2],ymm14[2]
3640 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm13[0],ymm12[0],ymm13[2],ymm12[2]
3641 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
3642 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3643 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3644 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm5[1],ymm0[3],ymm5[3]
3645 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
3646 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm4[1],ymm1[3],ymm4[3]
3647 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
3648 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3649 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm6[1],ymm7[3],ymm6[3]
3650 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm2[1],ymm3[3],ymm2[3]
3651 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
3652 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3653 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm10[1],ymm11[3],ymm10[3]
3654 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm9[1],ymm8[1],ymm9[3],ymm8[3]
3655 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
3656 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3657 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm15[1],ymm14[1],ymm15[3],ymm14[3]
3658 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm13[1],ymm12[1],ymm13[3],ymm12[3]
3659 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
3660 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3661 ; AVX2-FP-NEXT: vmovaps 352(%rdi), %xmm0
3662 ; AVX2-FP-NEXT: vinsertf128 $1, 480(%rdi), %ymm0, %ymm0
3663 ; AVX2-FP-NEXT: vmovaps 288(%rdi), %xmm1
3664 ; AVX2-FP-NEXT: vinsertf128 $1, 416(%rdi), %ymm1, %ymm1
3665 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
3666 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3667 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
3668 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3669 ; AVX2-FP-NEXT: vmovaps 864(%rdi), %xmm0
3670 ; AVX2-FP-NEXT: vinsertf128 $1, 992(%rdi), %ymm0, %ymm0
3671 ; AVX2-FP-NEXT: vmovaps 800(%rdi), %xmm1
3672 ; AVX2-FP-NEXT: vinsertf128 $1, 928(%rdi), %ymm1, %ymm1
3673 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
3674 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3675 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
3676 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3677 ; AVX2-FP-NEXT: vmovaps 96(%rdi), %xmm0
3678 ; AVX2-FP-NEXT: vinsertf128 $1, 224(%rdi), %ymm0, %ymm0
3679 ; AVX2-FP-NEXT: vmovaps 32(%rdi), %xmm1
3680 ; AVX2-FP-NEXT: vinsertf128 $1, 160(%rdi), %ymm1, %ymm1
3681 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
3682 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3683 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
3684 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3685 ; AVX2-FP-NEXT: vmovaps 608(%rdi), %xmm0
3686 ; AVX2-FP-NEXT: vinsertf128 $1, 736(%rdi), %ymm0, %ymm0
3687 ; AVX2-FP-NEXT: vmovaps 544(%rdi), %xmm1
3688 ; AVX2-FP-NEXT: vinsertf128 $1, 672(%rdi), %ymm1, %ymm1
3689 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
3690 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3691 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
3692 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3693 ; AVX2-FP-NEXT: vmovaps 352(%rdi), %ymm0
3694 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3695 ; AVX2-FP-NEXT: vmovaps 288(%rdi), %ymm15
3696 ; AVX2-FP-NEXT: vmovaps 480(%rdi), %ymm12
3697 ; AVX2-FP-NEXT: vmovaps 416(%rdi), %ymm10
3698 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm10[0],ymm12[0],ymm10[2],ymm12[2]
3699 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm15[0],ymm0[0],ymm15[2],ymm0[2]
3700 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm5[2,3],ymm4[2,3]
3701 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3702 ; AVX2-FP-NEXT: vmovaps 864(%rdi), %ymm0
3703 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3704 ; AVX2-FP-NEXT: vmovaps 800(%rdi), %ymm14
3705 ; AVX2-FP-NEXT: vmovaps 992(%rdi), %ymm11
3706 ; AVX2-FP-NEXT: vmovaps 928(%rdi), %ymm7
3707 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm7[0],ymm11[0],ymm7[2],ymm11[2]
3708 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm14[0],ymm0[0],ymm14[2],ymm0[2]
3709 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm9[2,3],ymm8[2,3]
3710 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3711 ; AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm9
3712 ; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm6
3713 ; AVX2-FP-NEXT: vmovaps 224(%rdi), %ymm3
3714 ; AVX2-FP-NEXT: vmovaps 160(%rdi), %ymm2
3715 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
3716 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm6[0],ymm9[0],ymm6[2],ymm9[2]
3717 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm8[2,3]
3718 ; AVX2-FP-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
3719 ; AVX2-FP-NEXT: vmovaps 608(%rdi), %ymm5
3720 ; AVX2-FP-NEXT: vmovaps 544(%rdi), %ymm4
3721 ; AVX2-FP-NEXT: vmovaps 736(%rdi), %ymm1
3722 ; AVX2-FP-NEXT: vmovaps 672(%rdi), %ymm0
3723 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
3724 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
3725 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm13[2,3],ymm8[2,3]
3726 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
3727 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm6[1],ymm9[1],ymm6[3],ymm9[3]
3728 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm3[2,3],ymm2[2,3]
3729 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm10[1],ymm12[1],ymm10[3],ymm12[3]
3730 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload
3731 ; AVX2-FP-NEXT: # ymm3 = ymm15[1],mem[1],ymm15[3],mem[3]
3732 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm3[2,3],ymm2[2,3]
3733 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
3734 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
3735 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
3736 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm7[1],ymm11[1],ymm7[3],ymm11[3]
3737 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload
3738 ; AVX2-FP-NEXT: # ymm3 = ymm14[1],mem[1],ymm14[3],mem[3]
3739 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
3740 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
3741 ; AVX2-FP-NEXT: vmovaps %ymm3, 64(%rsi)
3742 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
3743 ; AVX2-FP-NEXT: vmovaps %ymm3, (%rsi)
3744 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
3745 ; AVX2-FP-NEXT: vmovaps %ymm3, 96(%rsi)
3746 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
3747 ; AVX2-FP-NEXT: vmovaps %ymm3, 32(%rsi)
3748 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
3749 ; AVX2-FP-NEXT: vmovaps %ymm3, 64(%rdx)
3750 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
3751 ; AVX2-FP-NEXT: vmovaps %ymm3, (%rdx)
3752 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
3753 ; AVX2-FP-NEXT: vmovaps %ymm3, 96(%rdx)
3754 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
3755 ; AVX2-FP-NEXT: vmovaps %ymm3, 32(%rdx)
3756 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
3757 ; AVX2-FP-NEXT: vmovaps %ymm3, 64(%rcx)
3758 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
3759 ; AVX2-FP-NEXT: vmovaps %ymm3, (%rcx)
3760 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
3761 ; AVX2-FP-NEXT: vmovaps %ymm3, 96(%rcx)
3762 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
3763 ; AVX2-FP-NEXT: vmovaps %ymm3, 32(%rcx)
3764 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
3765 ; AVX2-FP-NEXT: vmovaps %ymm3, 64(%r8)
3766 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
3767 ; AVX2-FP-NEXT: vmovaps %ymm3, (%r8)
3768 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
3769 ; AVX2-FP-NEXT: vmovaps %ymm3, 96(%r8)
3770 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
3771 ; AVX2-FP-NEXT: vmovaps %ymm3, 32(%r8)
3772 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
3773 ; AVX2-FP-NEXT: vmovaps %ymm3, 64(%r9)
3774 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
3775 ; AVX2-FP-NEXT: vmovaps %ymm3, (%r9)
3776 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
3777 ; AVX2-FP-NEXT: vmovaps %ymm3, 96(%r9)
3778 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
3779 ; AVX2-FP-NEXT: vmovaps %ymm3, 32(%r9)
3780 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
3781 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
3782 ; AVX2-FP-NEXT: vmovaps %ymm3, 64(%rax)
3783 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
3784 ; AVX2-FP-NEXT: vmovaps %ymm3, (%rax)
3785 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
3786 ; AVX2-FP-NEXT: vmovaps %ymm3, 96(%rax)
3787 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
3788 ; AVX2-FP-NEXT: vmovaps %ymm3, 32(%rax)
3789 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
3790 ; AVX2-FP-NEXT: vmovaps %ymm8, 64(%rax)
3791 ; AVX2-FP-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload
3792 ; AVX2-FP-NEXT: vmovaps %ymm3, (%rax)
3793 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
3794 ; AVX2-FP-NEXT: vmovaps %ymm3, 96(%rax)
3795 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
3796 ; AVX2-FP-NEXT: vmovaps %ymm3, 32(%rax)
3797 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
3798 ; AVX2-FP-NEXT: vmovaps %ymm1, 96(%rax)
3799 ; AVX2-FP-NEXT: vmovaps %ymm0, 64(%rax)
3800 ; AVX2-FP-NEXT: vmovaps %ymm2, 32(%rax)
3801 ; AVX2-FP-NEXT: vmovaps %ymm9, (%rax)
3802 ; AVX2-FP-NEXT: addq $808, %rsp # imm = 0x328
3803 ; AVX2-FP-NEXT: vzeroupper
3804 ; AVX2-FP-NEXT: retq
3806 ; AVX2-FCP-LABEL: load_i64_stride8_vf16:
3807 ; AVX2-FCP: # %bb.0:
3808 ; AVX2-FCP-NEXT: subq $808, %rsp # imm = 0x328
3809 ; AVX2-FCP-NEXT: vmovaps 832(%rdi), %ymm1
3810 ; AVX2-FCP-NEXT: vmovaps 768(%rdi), %ymm2
3811 ; AVX2-FCP-NEXT: vmovaps 320(%rdi), %ymm3
3812 ; AVX2-FCP-NEXT: vmovaps 256(%rdi), %ymm4
3813 ; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3814 ; AVX2-FCP-NEXT: vmovaps 448(%rdi), %ymm5
3815 ; AVX2-FCP-NEXT: vmovaps 384(%rdi), %ymm13
3816 ; AVX2-FCP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3817 ; AVX2-FCP-NEXT: vmovaps 320(%rdi), %xmm6
3818 ; AVX2-FCP-NEXT: vinsertf128 $1, 448(%rdi), %ymm6, %ymm6
3819 ; AVX2-FCP-NEXT: vmovaps 256(%rdi), %xmm7
3820 ; AVX2-FCP-NEXT: vinsertf128 $1, 384(%rdi), %ymm7, %ymm7
3821 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm6[0],ymm7[2],ymm6[2]
3822 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3823 ; AVX2-FCP-NEXT: vmovaps 832(%rdi), %xmm8
3824 ; AVX2-FCP-NEXT: vinsertf128 $1, 960(%rdi), %ymm8, %ymm8
3825 ; AVX2-FCP-NEXT: vmovaps 768(%rdi), %xmm9
3826 ; AVX2-FCP-NEXT: vinsertf128 $1, 896(%rdi), %ymm9, %ymm9
3827 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm9[0],ymm8[0],ymm9[2],ymm8[2]
3828 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3829 ; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm10
3830 ; AVX2-FCP-NEXT: vmovaps 64(%rdi), %xmm11
3831 ; AVX2-FCP-NEXT: vinsertf128 $1, 192(%rdi), %ymm11, %ymm11
3832 ; AVX2-FCP-NEXT: vinsertf128 $1, 128(%rdi), %ymm10, %ymm10
3833 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm11[0],ymm10[2],ymm11[2]
3834 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3835 ; AVX2-FCP-NEXT: vmovaps 576(%rdi), %xmm12
3836 ; AVX2-FCP-NEXT: vinsertf128 $1, 704(%rdi), %ymm12, %ymm12
3837 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm6[1],ymm7[3],ymm6[3]
3838 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3839 ; AVX2-FCP-NEXT: vmovaps 512(%rdi), %xmm6
3840 ; AVX2-FCP-NEXT: vinsertf128 $1, 640(%rdi), %ymm6, %ymm6
3841 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm9[1],ymm8[1],ymm9[3],ymm8[3]
3842 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3843 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm6[0],ymm12[0],ymm6[2],ymm12[2]
3844 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3845 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm11[1],ymm10[3],ymm11[3]
3846 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3847 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm12[1],ymm6[3],ymm12[3]
3848 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3849 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm13[0],ymm5[0],ymm13[2],ymm5[2]
3850 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm4[0],ymm3[0],ymm4[2],ymm3[2]
3851 ; AVX2-FCP-NEXT: vmovaps %ymm3, %ymm4
3852 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],ymm6[2,3]
3853 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3854 ; AVX2-FCP-NEXT: vmovaps 960(%rdi), %ymm6
3855 ; AVX2-FCP-NEXT: vmovaps 896(%rdi), %ymm7
3856 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm7[0],ymm6[0],ymm7[2],ymm6[2]
3857 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
3858 ; AVX2-FCP-NEXT: vmovaps %ymm2, %ymm3
3859 ; AVX2-FCP-NEXT: vmovaps %ymm1, %ymm2
3860 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm9[2,3],ymm8[2,3]
3861 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3862 ; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm8
3863 ; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm9
3864 ; AVX2-FCP-NEXT: vmovaps 192(%rdi), %ymm10
3865 ; AVX2-FCP-NEXT: vmovaps 128(%rdi), %ymm11
3866 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm11[0],ymm10[0],ymm11[2],ymm10[2]
3867 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm9[0],ymm8[0],ymm9[2],ymm8[2]
3868 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm12[2,3]
3869 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3870 ; AVX2-FCP-NEXT: vmovaps 576(%rdi), %ymm12
3871 ; AVX2-FCP-NEXT: vmovaps 512(%rdi), %ymm13
3872 ; AVX2-FCP-NEXT: vmovaps 704(%rdi), %ymm14
3873 ; AVX2-FCP-NEXT: vmovaps 640(%rdi), %ymm15
3874 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm14[0],ymm15[2],ymm14[2]
3875 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm13[0],ymm12[0],ymm13[2],ymm12[2]
3876 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
3877 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3878 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3879 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm5[1],ymm0[3],ymm5[3]
3880 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
3881 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm4[1],ymm1[3],ymm4[3]
3882 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
3883 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3884 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm6[1],ymm7[3],ymm6[3]
3885 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm2[1],ymm3[3],ymm2[3]
3886 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
3887 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3888 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm10[1],ymm11[3],ymm10[3]
3889 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm9[1],ymm8[1],ymm9[3],ymm8[3]
3890 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
3891 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3892 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm15[1],ymm14[1],ymm15[3],ymm14[3]
3893 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm13[1],ymm12[1],ymm13[3],ymm12[3]
3894 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
3895 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3896 ; AVX2-FCP-NEXT: vmovaps 352(%rdi), %xmm0
3897 ; AVX2-FCP-NEXT: vinsertf128 $1, 480(%rdi), %ymm0, %ymm0
3898 ; AVX2-FCP-NEXT: vmovaps 288(%rdi), %xmm1
3899 ; AVX2-FCP-NEXT: vinsertf128 $1, 416(%rdi), %ymm1, %ymm1
3900 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
3901 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3902 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
3903 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3904 ; AVX2-FCP-NEXT: vmovaps 864(%rdi), %xmm0
3905 ; AVX2-FCP-NEXT: vinsertf128 $1, 992(%rdi), %ymm0, %ymm0
3906 ; AVX2-FCP-NEXT: vmovaps 800(%rdi), %xmm1
3907 ; AVX2-FCP-NEXT: vinsertf128 $1, 928(%rdi), %ymm1, %ymm1
3908 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
3909 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3910 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
3911 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3912 ; AVX2-FCP-NEXT: vmovaps 96(%rdi), %xmm0
3913 ; AVX2-FCP-NEXT: vinsertf128 $1, 224(%rdi), %ymm0, %ymm0
3914 ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %xmm1
3915 ; AVX2-FCP-NEXT: vinsertf128 $1, 160(%rdi), %ymm1, %ymm1
3916 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
3917 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3918 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
3919 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3920 ; AVX2-FCP-NEXT: vmovaps 608(%rdi), %xmm0
3921 ; AVX2-FCP-NEXT: vinsertf128 $1, 736(%rdi), %ymm0, %ymm0
3922 ; AVX2-FCP-NEXT: vmovaps 544(%rdi), %xmm1
3923 ; AVX2-FCP-NEXT: vinsertf128 $1, 672(%rdi), %ymm1, %ymm1
3924 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
3925 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3926 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
3927 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3928 ; AVX2-FCP-NEXT: vmovaps 352(%rdi), %ymm0
3929 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3930 ; AVX2-FCP-NEXT: vmovaps 288(%rdi), %ymm15
3931 ; AVX2-FCP-NEXT: vmovaps 480(%rdi), %ymm12
3932 ; AVX2-FCP-NEXT: vmovaps 416(%rdi), %ymm10
3933 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm10[0],ymm12[0],ymm10[2],ymm12[2]
3934 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm15[0],ymm0[0],ymm15[2],ymm0[2]
3935 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm5[2,3],ymm4[2,3]
3936 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3937 ; AVX2-FCP-NEXT: vmovaps 864(%rdi), %ymm0
3938 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3939 ; AVX2-FCP-NEXT: vmovaps 800(%rdi), %ymm14
3940 ; AVX2-FCP-NEXT: vmovaps 992(%rdi), %ymm11
3941 ; AVX2-FCP-NEXT: vmovaps 928(%rdi), %ymm7
3942 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm7[0],ymm11[0],ymm7[2],ymm11[2]
3943 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm14[0],ymm0[0],ymm14[2],ymm0[2]
3944 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm9[2,3],ymm8[2,3]
3945 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3946 ; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm9
3947 ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm6
3948 ; AVX2-FCP-NEXT: vmovaps 224(%rdi), %ymm3
3949 ; AVX2-FCP-NEXT: vmovaps 160(%rdi), %ymm2
3950 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
3951 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm6[0],ymm9[0],ymm6[2],ymm9[2]
3952 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm8[2,3]
3953 ; AVX2-FCP-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
3954 ; AVX2-FCP-NEXT: vmovaps 608(%rdi), %ymm5
3955 ; AVX2-FCP-NEXT: vmovaps 544(%rdi), %ymm4
3956 ; AVX2-FCP-NEXT: vmovaps 736(%rdi), %ymm1
3957 ; AVX2-FCP-NEXT: vmovaps 672(%rdi), %ymm0
3958 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
3959 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
3960 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm13[2,3],ymm8[2,3]
3961 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
3962 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm6[1],ymm9[1],ymm6[3],ymm9[3]
3963 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm3[2,3],ymm2[2,3]
3964 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm10[1],ymm12[1],ymm10[3],ymm12[3]
3965 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload
3966 ; AVX2-FCP-NEXT: # ymm3 = ymm15[1],mem[1],ymm15[3],mem[3]
3967 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm3[2,3],ymm2[2,3]
3968 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
3969 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
3970 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
3971 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm7[1],ymm11[1],ymm7[3],ymm11[3]
3972 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload
3973 ; AVX2-FCP-NEXT: # ymm3 = ymm14[1],mem[1],ymm14[3],mem[3]
3974 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
3975 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
3976 ; AVX2-FCP-NEXT: vmovaps %ymm3, 64(%rsi)
3977 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
3978 ; AVX2-FCP-NEXT: vmovaps %ymm3, (%rsi)
3979 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
3980 ; AVX2-FCP-NEXT: vmovaps %ymm3, 96(%rsi)
3981 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
3982 ; AVX2-FCP-NEXT: vmovaps %ymm3, 32(%rsi)
3983 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
3984 ; AVX2-FCP-NEXT: vmovaps %ymm3, 64(%rdx)
3985 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
3986 ; AVX2-FCP-NEXT: vmovaps %ymm3, (%rdx)
3987 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
3988 ; AVX2-FCP-NEXT: vmovaps %ymm3, 96(%rdx)
3989 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
3990 ; AVX2-FCP-NEXT: vmovaps %ymm3, 32(%rdx)
3991 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
3992 ; AVX2-FCP-NEXT: vmovaps %ymm3, 64(%rcx)
3993 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
3994 ; AVX2-FCP-NEXT: vmovaps %ymm3, (%rcx)
3995 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
3996 ; AVX2-FCP-NEXT: vmovaps %ymm3, 96(%rcx)
3997 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
3998 ; AVX2-FCP-NEXT: vmovaps %ymm3, 32(%rcx)
3999 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
4000 ; AVX2-FCP-NEXT: vmovaps %ymm3, 64(%r8)
4001 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
4002 ; AVX2-FCP-NEXT: vmovaps %ymm3, (%r8)
4003 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
4004 ; AVX2-FCP-NEXT: vmovaps %ymm3, 96(%r8)
4005 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
4006 ; AVX2-FCP-NEXT: vmovaps %ymm3, 32(%r8)
4007 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
4008 ; AVX2-FCP-NEXT: vmovaps %ymm3, 64(%r9)
4009 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
4010 ; AVX2-FCP-NEXT: vmovaps %ymm3, (%r9)
4011 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
4012 ; AVX2-FCP-NEXT: vmovaps %ymm3, 96(%r9)
4013 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
4014 ; AVX2-FCP-NEXT: vmovaps %ymm3, 32(%r9)
4015 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
4016 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
4017 ; AVX2-FCP-NEXT: vmovaps %ymm3, 64(%rax)
4018 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
4019 ; AVX2-FCP-NEXT: vmovaps %ymm3, (%rax)
4020 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
4021 ; AVX2-FCP-NEXT: vmovaps %ymm3, 96(%rax)
4022 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
4023 ; AVX2-FCP-NEXT: vmovaps %ymm3, 32(%rax)
4024 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
4025 ; AVX2-FCP-NEXT: vmovaps %ymm8, 64(%rax)
4026 ; AVX2-FCP-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload
4027 ; AVX2-FCP-NEXT: vmovaps %ymm3, (%rax)
4028 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
4029 ; AVX2-FCP-NEXT: vmovaps %ymm3, 96(%rax)
4030 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
4031 ; AVX2-FCP-NEXT: vmovaps %ymm3, 32(%rax)
4032 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
4033 ; AVX2-FCP-NEXT: vmovaps %ymm1, 96(%rax)
4034 ; AVX2-FCP-NEXT: vmovaps %ymm0, 64(%rax)
4035 ; AVX2-FCP-NEXT: vmovaps %ymm2, 32(%rax)
4036 ; AVX2-FCP-NEXT: vmovaps %ymm9, (%rax)
4037 ; AVX2-FCP-NEXT: addq $808, %rsp # imm = 0x328
4038 ; AVX2-FCP-NEXT: vzeroupper
4039 ; AVX2-FCP-NEXT: retq
4040 ; Expected llc output for load_i64_stride8_vf16 in the plain -mattr=+avx512vl configuration (checked under the AVX512 prefix); the block must pin the entry block and the final retq like its sibling blocks do.
4041 ; AVX512-LABEL: load_i64_stride8_vf16:
4042 ; AVX512: # %bb.0:
4043 ; AVX512-NEXT: subq $264, %rsp # imm = 0x108
4044 ; AVX512-NEXT: vmovaps 576(%rdi), %zmm0
4045 ; AVX512-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4046 ; AVX512-NEXT: vmovaps 512(%rdi), %zmm0
4047 ; AVX512-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4048 ; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm6
4049 ; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm4
4050 ; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4051 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm7
4052 ; AVX512-NEXT: vmovaps (%rdi), %zmm0
4053 ; AVX512-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill
4054 ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm31
4055 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm8
4056 ; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm10
4057 ; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm14
4058 ; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm30
4059 ; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm28
4060 ; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm16
4061 ; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm5
4062 ; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm3
4063 ; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm15
4064 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [0,8,0,8,0,8,0,8]
4065 ; AVX512-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4066 ; AVX512-NEXT: vmovdqa64 %zmm15, %zmm17
4067 ; AVX512-NEXT: vmovdqa64 %zmm15, %zmm20
4068 ; AVX512-NEXT: vmovdqa64 %zmm15, %zmm9
4069 ; AVX512-NEXT: vmovdqa64 %zmm15, %zmm19
4070 ; AVX512-NEXT: vpermt2q %zmm3, %zmm29, %zmm19
4071 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm18
4072 ; AVX512-NEXT: vpermt2q %zmm16, %zmm29, %zmm18
4073 ; AVX512-NEXT: movb $-64, %al
4074 ; AVX512-NEXT: kmovw %eax, %k1
4075 ; AVX512-NEXT: vmovdqa64 %zmm19, %zmm18 {%k1}
4076 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [2,10,2,10,2,10,2,10]
4077 ; AVX512-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4078 ; AVX512-NEXT: vpermt2q %zmm3, %zmm19, %zmm17
4079 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm21
4080 ; AVX512-NEXT: vpermt2q %zmm16, %zmm19, %zmm21
4081 ; AVX512-NEXT: vmovdqa64 %zmm17, %zmm21 {%k1}
4082 ; AVX512-NEXT: vmovdqa64 192(%rdi), %ymm22
4083 ; AVX512-NEXT: vmovdqa64 128(%rdi), %ymm23
4084 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm23[0],ymm22[0],ymm23[2],ymm22[2]
4085 ; AVX512-NEXT: vmovdqa64 64(%rdi), %ymm24
4086 ; AVX512-NEXT: vmovdqa64 (%rdi), %ymm25
4087 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm25[0],ymm24[0],ymm25[2],ymm24[2]
4088 ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3]
4089 ; AVX512-NEXT: vinserti64x4 $0, %ymm11, %zmm21, %zmm0
4090 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4091 ; AVX512-NEXT: vmovdqa64 %zmm28, %zmm11
4092 ; AVX512-NEXT: vpermt2q %zmm30, %zmm19, %zmm11
4093 ; AVX512-NEXT: vpermi2q %zmm10, %zmm14, %zmm19
4094 ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm19 {%k1}
4095 ; AVX512-NEXT: vmovdqa 704(%rdi), %ymm11
4096 ; AVX512-NEXT: vmovdqa 640(%rdi), %ymm12
4097 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm11[0],ymm12[2],ymm11[2]
4098 ; AVX512-NEXT: vmovdqa64 576(%rdi), %ymm26
4099 ; AVX512-NEXT: vmovdqa64 512(%rdi), %ymm27
4100 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm27[0],ymm26[0],ymm27[2],ymm26[2]
4101 ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm13[2,3]
4102 ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm19
4103 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11]
4104 ; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4105 ; AVX512-NEXT: vpermt2q %zmm3, %zmm0, %zmm20
4106 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm13
4107 ; AVX512-NEXT: vpermt2q %zmm16, %zmm0, %zmm13
4108 ; AVX512-NEXT: vmovdqa64 %zmm20, %zmm13 {%k1}
4109 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [1,9,1,9,1,9,1,9]
4110 ; AVX512-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4111 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm22[1],ymm23[3],ymm22[3]
4112 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm23
4113 ; AVX512-NEXT: vpermt2q %zmm16, %zmm21, %zmm23
4114 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm25[1],ymm24[1],ymm25[3],ymm24[3]
4115 ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
4116 ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm20
4117 ; AVX512-NEXT: vmovdqa64 %zmm28, %zmm1
4118 ; AVX512-NEXT: vpermt2q %zmm30, %zmm0, %zmm1
4119 ; AVX512-NEXT: vpermi2q %zmm10, %zmm14, %zmm0
4120 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
4121 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm11[1],ymm12[3],ymm11[3]
4122 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm27[1],ymm26[1],ymm27[3],ymm26[3]
4123 ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
4124 ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm22
4125 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12]
4126 ; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4127 ; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4128 ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm1
4129 ; AVX512-NEXT: vpermt2q %zmm31, %zmm0, %zmm1
4130 ; AVX512-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload
4131 ; AVX512-NEXT: vmovdqa64 %zmm13, %zmm2
4132 ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm11
4133 ; AVX512-NEXT: vpermt2q %zmm7, %zmm0, %zmm2
4134 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
4135 ; AVX512-NEXT: vpermt2q %zmm3, %zmm0, %zmm9
4136 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm5[0],zmm16[0],zmm5[2],zmm16[2],zmm5[4],zmm16[4],zmm5[6],zmm16[6]
4137 ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm2 {%k1}
4138 ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm24
4139 ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm1
4140 ; AVX512-NEXT: vpermt2q %zmm6, %zmm0, %zmm1
4141 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
4142 ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm2
4143 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
4144 ; AVX512-NEXT: vpermt2q %zmm7, %zmm0, %zmm2
4145 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
4146 ; AVX512-NEXT: vpermi2q %zmm30, %zmm28, %zmm0
4147 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm14[0],zmm10[0],zmm14[2],zmm10[2],zmm14[4],zmm10[4],zmm14[6],zmm10[6]
4148 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
4149 ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm25
4150 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [5,13,5,13,5,13,5,13]
4151 ; AVX512-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4152 ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm0
4153 ; AVX512-NEXT: vpermt2q %zmm31, %zmm27, %zmm0
4154 ; AVX512-NEXT: vmovdqa64 %zmm13, %zmm1
4155 ; AVX512-NEXT: vpermt2q %zmm11, %zmm27, %zmm1
4156 ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm8
4157 ; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4158 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm5[1],zmm16[1],zmm5[3],zmm16[3],zmm5[5],zmm16[5],zmm5[7],zmm16[7]
4159 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [6,14,6,14,6,14,6,14]
4160 ; AVX512-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4161 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm1
4162 ; AVX512-NEXT: vpermt2q %zmm16, %zmm26, %zmm1
4163 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [7,15,7,15,7,15,7,15]
4164 ; AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4165 ; AVX512-NEXT: vpermt2q %zmm16, %zmm2, %zmm5
4166 ; AVX512-NEXT: vmovdqa64 %zmm15, %zmm12
4167 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k1} = zmm15[0],zmm3[0],zmm15[2],zmm3[2],zmm15[4],zmm3[4],zmm15[6],zmm3[6]
4168 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm15[1],zmm3[1],zmm15[3],zmm3[3],zmm15[5],zmm3[5],zmm15[7],zmm3[7]
4169 ; AVX512-NEXT: vpermt2q %zmm3, %zmm21, %zmm12
4170 ; AVX512-NEXT: vpermt2q %zmm3, %zmm27, %zmm15
4171 ; AVX512-NEXT: vmovdqa64 %zmm15, %zmm0 {%k1}
4172 ; AVX512-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm17
4173 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
4174 ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm0
4175 ; AVX512-NEXT: vpermt2q %zmm6, %zmm27, %zmm0
4176 ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm15
4177 ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm9
4178 ; AVX512-NEXT: vpermt2q %zmm7, %zmm27, %zmm9
4179 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
4180 ; AVX512-NEXT: vpermi2q %zmm30, %zmm28, %zmm27
4181 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm14[1],zmm10[1],zmm14[3],zmm10[3],zmm14[5],zmm10[5],zmm14[7],zmm10[7]
4182 ; AVX512-NEXT: vmovdqa64 %zmm27, %zmm9 {%k1}
4183 ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm27
4184 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
4185 ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm0
4186 ; AVX512-NEXT: vpermt2q %zmm31, %zmm26, %zmm0
4187 ; AVX512-NEXT: vmovdqa64 %zmm13, %zmm9
4188 ; AVX512-NEXT: vpermt2q %zmm8, %zmm26, %zmm9
4189 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
4190 ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm16
4191 ; AVX512-NEXT: vmovdqa (%rdi), %xmm0
4192 ; AVX512-NEXT: vpermt2q %zmm31, %zmm2, %zmm4
4193 ; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4194 ; AVX512-NEXT: vmovdqa64 %zmm28, %zmm1
4195 ; AVX512-NEXT: vpermt2q %zmm30, %zmm29, %zmm1
4196 ; AVX512-NEXT: vpermi2q %zmm10, %zmm14, %zmm29
4197 ; AVX512-NEXT: vpermt2q %zmm8, %zmm2, %zmm13
4198 ; AVX512-NEXT: vmovdqa64 %zmm13, %zmm4
4199 ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm9
4200 ; AVX512-NEXT: vpermt2q %zmm6, %zmm26, %zmm9
4201 ; AVX512-NEXT: vpermt2q %zmm6, %zmm2, %zmm11
4202 ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm8
4203 ; AVX512-NEXT: vmovdqa64 %zmm15, %zmm13
4204 ; AVX512-NEXT: vmovdqa64 %zmm15, %zmm11
4205 ; AVX512-NEXT: vpermt2q %zmm7, %zmm26, %zmm11
4206 ; AVX512-NEXT: vpermt2q %zmm7, %zmm2, %zmm13
4207 ; AVX512-NEXT: vmovdqa64 %zmm13, %zmm15
4208 ; AVX512-NEXT: vmovdqa64 %zmm28, %zmm13
4209 ; AVX512-NEXT: vpermt2q %zmm30, %zmm21, %zmm13
4210 ; AVX512-NEXT: vpermi2q %zmm10, %zmm14, %zmm21
4211 ; AVX512-NEXT: vpermi2q %zmm10, %zmm14, %zmm26
4212 ; AVX512-NEXT: vpermt2q %zmm10, %zmm2, %zmm14
4213 ; AVX512-NEXT: vmovdqa 64(%rdi), %xmm2
4214 ; AVX512-NEXT: vinserti128 $1, 192(%rdi), %ymm2, %ymm2
4215 ; AVX512-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm0
4216 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
4217 ; AVX512-NEXT: vinserti64x4 $0, %ymm10, %zmm18, %zmm10
4218 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm29 {%k1}
4219 ; AVX512-NEXT: vmovdqa 576(%rdi), %xmm1
4220 ; AVX512-NEXT: vinserti128 $1, 704(%rdi), %ymm1, %ymm1
4221 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm26 {%k1} = zmm28[0],zmm30[0],zmm28[2],zmm30[2],zmm28[4],zmm30[4],zmm28[6],zmm30[6]
4222 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm14 {%k1} = zmm28[1],zmm30[1],zmm28[3],zmm30[3],zmm28[5],zmm30[5],zmm28[7],zmm30[7]
4223 ; AVX512-NEXT: vmovdqa 512(%rdi), %xmm6
4224 ; AVX512-NEXT: vinserti128 $1, 640(%rdi), %ymm6, %ymm6
4225 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm1[0],ymm6[2],ymm1[2]
4226 ; AVX512-NEXT: vinserti64x4 $0, %ymm7, %zmm29, %zmm7
4227 ; AVX512-NEXT: vmovdqa64 %zmm12, %zmm23 {%k1}
4228 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
4229 ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0
4230 ; AVX512-NEXT: vmovdqa64 %zmm13, %zmm21 {%k1}
4231 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm1[1],ymm6[3],ymm1[3]
4232 ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm21, %zmm1
4233 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm9[4,5,6,7]
4234 ; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm26, %zmm2
4235 ; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
4236 ; AVX512-NEXT: # ymm4 = ymm4[0,1,2,3],mem[4,5,6,7]
4237 ; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4
4238 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3],ymm8[4,5,6,7]
4239 ; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm14, %zmm3
4240 ; AVX512-NEXT: vmovdqa64 %zmm7, 64(%rsi)
4241 ; AVX512-NEXT: vmovdqa64 %zmm10, (%rsi)
4242 ; AVX512-NEXT: vmovdqa64 %zmm1, 64(%rdx)
4243 ; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx)
4244 ; AVX512-NEXT: vmovdqa64 %zmm19, 64(%rcx)
4245 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
4246 ; AVX512-NEXT: vmovaps %zmm0, (%rcx)
4247 ; AVX512-NEXT: vmovdqa64 %zmm22, 64(%r8)
4248 ; AVX512-NEXT: vmovdqa64 %zmm20, (%r8)
4249 ; AVX512-NEXT: vmovdqa64 %zmm25, 64(%r9)
4250 ; AVX512-NEXT: vmovdqa64 %zmm24, (%r9)
4251 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
4252 ; AVX512-NEXT: vmovdqa64 %zmm27, 64(%rax)
4253 ; AVX512-NEXT: vmovdqa64 %zmm17, (%rax)
4254 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
4255 ; AVX512-NEXT: vmovdqa64 %zmm2, 64(%rax)
4256 ; AVX512-NEXT: vmovdqa64 %zmm16, (%rax)
4257 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
4258 ; AVX512-NEXT: vmovdqa64 %zmm3, 64(%rax)
4259 ; AVX512-NEXT: vmovdqa64 %zmm4, (%rax)
4260 ; AVX512-NEXT: addq $264, %rsp # imm = 0x108
4261 ; AVX512-NEXT: vzeroupper
4262 ; AVX512-NEXT: retq
4263 ; Expected llc output for load_i64_stride8_vf16 with -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle (checked under the AVX512-FCP prefix).
4264 ; AVX512-FCP-LABEL: load_i64_stride8_vf16:
4265 ; AVX512-FCP: # %bb.0:
4266 ; AVX512-FCP-NEXT: subq $264, %rsp # imm = 0x108
4267 ; AVX512-FCP-NEXT: vmovaps 576(%rdi), %zmm0
4268 ; AVX512-FCP-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4269 ; AVX512-FCP-NEXT: vmovaps 512(%rdi), %zmm0
4270 ; AVX512-FCP-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4271 ; AVX512-FCP-NEXT: vmovdqa64 704(%rdi), %zmm6
4272 ; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %zmm4
4273 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4274 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm7
4275 ; AVX512-FCP-NEXT: vmovaps (%rdi), %zmm0
4276 ; AVX512-FCP-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill
4277 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm31
4278 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm8
4279 ; AVX512-FCP-NEXT: vmovdqa64 832(%rdi), %zmm10
4280 ; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm14
4281 ; AVX512-FCP-NEXT: vmovdqa64 960(%rdi), %zmm30
4282 ; AVX512-FCP-NEXT: vmovdqa64 896(%rdi), %zmm28
4283 ; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm16
4284 ; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm5
4285 ; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm3
4286 ; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm15
4287 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [0,8,0,8,0,8,0,8]
4288 ; AVX512-FCP-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4289 ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm17
4290 ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm20
4291 ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm9
4292 ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm19
4293 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm29, %zmm19
4294 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm18
4295 ; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm29, %zmm18
4296 ; AVX512-FCP-NEXT: movb $-64, %al
4297 ; AVX512-FCP-NEXT: kmovw %eax, %k1
4298 ; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm18 {%k1}
4299 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [2,10,2,10,2,10,2,10]
4300 ; AVX512-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4301 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm19, %zmm17
4302 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm21
4303 ; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm19, %zmm21
4304 ; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm21 {%k1}
4305 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm22
4306 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %ymm23
4307 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm23[0],ymm22[0],ymm23[2],ymm22[2]
4308 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %ymm24
4309 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %ymm25
4310 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm25[0],ymm24[0],ymm25[2],ymm24[2]
4311 ; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3]
4312 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm21, %zmm0
4313 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4314 ; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm11
4315 ; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm19, %zmm11
4316 ; AVX512-FCP-NEXT: vpermi2q %zmm10, %zmm14, %zmm19
4317 ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm19 {%k1}
4318 ; AVX512-FCP-NEXT: vmovdqa 704(%rdi), %ymm11
4319 ; AVX512-FCP-NEXT: vmovdqa 640(%rdi), %ymm12
4320 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm11[0],ymm12[2],ymm11[2]
4321 ; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %ymm26
4322 ; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %ymm27
4323 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm27[0],ymm26[0],ymm27[2],ymm26[2]
4324 ; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm13[2,3]
4325 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm19
4326 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11]
4327 ; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4328 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm20
4329 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm13
4330 ; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm13
4331 ; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm13 {%k1}
4332 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [1,9,1,9,1,9,1,9]
4333 ; AVX512-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4334 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm22[1],ymm23[3],ymm22[3]
4335 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm23
4336 ; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm21, %zmm23
4337 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm25[1],ymm24[1],ymm25[3],ymm24[3]
4338 ; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
4339 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm20
4340 ; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm1
4341 ; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm0, %zmm1
4342 ; AVX512-FCP-NEXT: vpermi2q %zmm10, %zmm14, %zmm0
4343 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
4344 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm11[1],ymm12[3],ymm11[3]
4345 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm27[1],ymm26[1],ymm27[3],ymm26[3]
4346 ; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
4347 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm22
4348 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12]
4349 ; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4350 ; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4351 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm1
4352 ; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm1
4353 ; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload
4354 ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm2
4355 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm11
4356 ; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm2
4357 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
4358 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm9
4359 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm5[0],zmm16[0],zmm5[2],zmm16[2],zmm5[4],zmm16[4],zmm5[6],zmm16[6]
4360 ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 {%k1}
4361 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm24
4362 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm1
4363 ; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm1
4364 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
4365 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm2
4366 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
4367 ; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm2
4368 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
4369 ; AVX512-FCP-NEXT: vpermi2q %zmm30, %zmm28, %zmm0
4370 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm14[0],zmm10[0],zmm14[2],zmm10[2],zmm14[4],zmm10[4],zmm14[6],zmm10[6]
4371 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
4372 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm25
4373 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [5,13,5,13,5,13,5,13]
4374 ; AVX512-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4375 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm0
4376 ; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm27, %zmm0
4377 ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm1
4378 ; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm27, %zmm1
4379 ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm8
4380 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4381 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm5[1],zmm16[1],zmm5[3],zmm16[3],zmm5[5],zmm16[5],zmm5[7],zmm16[7]
4382 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [6,14,6,14,6,14,6,14]
4383 ; AVX512-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4384 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm1
4385 ; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm26, %zmm1
4386 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [7,15,7,15,7,15,7,15]
4387 ; AVX512-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4388 ; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm2, %zmm5
4389 ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm12
4390 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k1} = zmm15[0],zmm3[0],zmm15[2],zmm3[2],zmm15[4],zmm3[4],zmm15[6],zmm3[6]
4391 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm15[1],zmm3[1],zmm15[3],zmm3[3],zmm15[5],zmm3[5],zmm15[7],zmm3[7]
4392 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm21, %zmm12
4393 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm27, %zmm15
4394 ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm0 {%k1}
4395 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm17
4396 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
4397 ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm0
4398 ; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm27, %zmm0
4399 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm15
4400 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm9
4401 ; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm27, %zmm9
4402 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
4403 ; AVX512-FCP-NEXT: vpermi2q %zmm30, %zmm28, %zmm27
4404 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm14[1],zmm10[1],zmm14[3],zmm10[3],zmm14[5],zmm10[5],zmm14[7],zmm10[7]
4405 ; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm9 {%k1}
4406 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm27
4407 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
4408 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm0
4409 ; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm26, %zmm0
4410 ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm9
4411 ; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm26, %zmm9
4412 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
4413 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm16
4414 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
4415 ; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm2, %zmm4
4416 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4417 ; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm1
4418 ; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm29, %zmm1
4419 ; AVX512-FCP-NEXT: vpermi2q %zmm10, %zmm14, %zmm29
4420 ; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm13
4421 ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm4
4422 ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm9
4423 ; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm26, %zmm9
4424 ; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm11
4425 ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm8
4426 ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm13
4427 ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm11
4428 ; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm26, %zmm11
4429 ; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm2, %zmm13
4430 ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm15
4431 ; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm13
4432 ; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm21, %zmm13
4433 ; AVX512-FCP-NEXT: vpermi2q %zmm10, %zmm14, %zmm21
4434 ; AVX512-FCP-NEXT: vpermi2q %zmm10, %zmm14, %zmm26
4435 ; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm2, %zmm14
4436 ; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm2
4437 ; AVX512-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm2, %ymm2
4438 ; AVX512-FCP-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm0
4439 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
4440 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm18, %zmm10
4441 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm29 {%k1}
4442 ; AVX512-FCP-NEXT: vmovdqa 576(%rdi), %xmm1
4443 ; AVX512-FCP-NEXT: vinserti128 $1, 704(%rdi), %ymm1, %ymm1
4444 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm26 {%k1} = zmm28[0],zmm30[0],zmm28[2],zmm30[2],zmm28[4],zmm30[4],zmm28[6],zmm30[6]
4445 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm14 {%k1} = zmm28[1],zmm30[1],zmm28[3],zmm30[3],zmm28[5],zmm30[5],zmm28[7],zmm30[7]
4446 ; AVX512-FCP-NEXT: vmovdqa 512(%rdi), %xmm6
4447 ; AVX512-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm6, %ymm6
4448 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm1[0],ymm6[2],ymm1[2]
4449 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm29, %zmm7
4450 ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm23 {%k1}
4451 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
4452 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0
4453 ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm21 {%k1}
4454 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm1[1],ymm6[3],ymm1[3]
4455 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm21, %zmm1
4456 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm9[4,5,6,7]
4457 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm26, %zmm2
4458 ; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
4459 ; AVX512-FCP-NEXT: # ymm4 = ymm4[0,1,2,3],mem[4,5,6,7]
4460 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4
4461 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3],ymm8[4,5,6,7]
4462 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm14, %zmm3
4463 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 64(%rsi)
4464 ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, (%rsi)
4465 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 64(%rdx)
4466 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%rdx)
4467 ; AVX512-FCP-NEXT: vmovdqa64 %zmm19, 64(%rcx)
4468 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
4469 ; AVX512-FCP-NEXT: vmovaps %zmm0, (%rcx)
4470 ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, 64(%r8)
4471 ; AVX512-FCP-NEXT: vmovdqa64 %zmm20, (%r8)
4472 ; AVX512-FCP-NEXT: vmovdqa64 %zmm25, 64(%r9)
4473 ; AVX512-FCP-NEXT: vmovdqa64 %zmm24, (%r9)
4474 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
4475 ; AVX512-FCP-NEXT: vmovdqa64 %zmm27, 64(%rax)
4476 ; AVX512-FCP-NEXT: vmovdqa64 %zmm17, (%rax)
4477 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
4478 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 64(%rax)
4479 ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, (%rax)
4480 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
4481 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 64(%rax)
4482 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%rax)
4483 ; AVX512-FCP-NEXT: addq $264, %rsp # imm = 0x108
4484 ; AVX512-FCP-NEXT: vzeroupper
4485 ; AVX512-FCP-NEXT: retq
4487 ; AVX512DQ-LABEL: load_i64_stride8_vf16:
4488 ; AVX512DQ: # %bb.0:
4489 ; AVX512DQ-NEXT: subq $264, %rsp # imm = 0x108
4490 ; AVX512DQ-NEXT: vmovaps 576(%rdi), %zmm0
4491 ; AVX512DQ-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4492 ; AVX512DQ-NEXT: vmovaps 512(%rdi), %zmm0
4493 ; AVX512DQ-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4494 ; AVX512DQ-NEXT: vmovdqa64 704(%rdi), %zmm6
4495 ; AVX512DQ-NEXT: vmovdqa64 640(%rdi), %zmm4
4496 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4497 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm7
4498 ; AVX512DQ-NEXT: vmovaps (%rdi), %zmm0
4499 ; AVX512DQ-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill
4500 ; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm31
4501 ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm8
4502 ; AVX512DQ-NEXT: vmovdqa64 832(%rdi), %zmm10
4503 ; AVX512DQ-NEXT: vmovdqa64 768(%rdi), %zmm14
4504 ; AVX512DQ-NEXT: vmovdqa64 960(%rdi), %zmm30
4505 ; AVX512DQ-NEXT: vmovdqa64 896(%rdi), %zmm28
4506 ; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm16
4507 ; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm5
4508 ; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm3
4509 ; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm15
4510 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [0,8,0,8,0,8,0,8]
4511 ; AVX512DQ-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4512 ; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm17
4513 ; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm20
4514 ; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm9
4515 ; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm19
4516 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm29, %zmm19
4517 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm18
4518 ; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm29, %zmm18
4519 ; AVX512DQ-NEXT: movb $-64, %al
4520 ; AVX512DQ-NEXT: kmovw %eax, %k1
4521 ; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm18 {%k1}
4522 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [2,10,2,10,2,10,2,10]
4523 ; AVX512DQ-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4524 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm19, %zmm17
4525 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm21
4526 ; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm19, %zmm21
4527 ; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm21 {%k1}
4528 ; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %ymm22
4529 ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %ymm23
4530 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm23[0],ymm22[0],ymm23[2],ymm22[2]
4531 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %ymm24
4532 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %ymm25
4533 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm25[0],ymm24[0],ymm25[2],ymm24[2]
4534 ; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3]
4535 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm11, %zmm21, %zmm0
4536 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4537 ; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm11
4538 ; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm19, %zmm11
4539 ; AVX512DQ-NEXT: vpermi2q %zmm10, %zmm14, %zmm19
4540 ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm19 {%k1}
4541 ; AVX512DQ-NEXT: vmovdqa 704(%rdi), %ymm11
4542 ; AVX512DQ-NEXT: vmovdqa 640(%rdi), %ymm12
4543 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm11[0],ymm12[2],ymm11[2]
4544 ; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %ymm26
4545 ; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %ymm27
4546 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm27[0],ymm26[0],ymm27[2],ymm26[2]
4547 ; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm13[2,3]
4548 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm19
4549 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11]
4550 ; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4551 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm0, %zmm20
4552 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm13
4553 ; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm0, %zmm13
4554 ; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm13 {%k1}
4555 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [1,9,1,9,1,9,1,9]
4556 ; AVX512DQ-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4557 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm22[1],ymm23[3],ymm22[3]
4558 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm23
4559 ; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm21, %zmm23
4560 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm25[1],ymm24[1],ymm25[3],ymm24[3]
4561 ; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
4562 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm20
4563 ; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm1
4564 ; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm0, %zmm1
4565 ; AVX512DQ-NEXT: vpermi2q %zmm10, %zmm14, %zmm0
4566 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
4567 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm11[1],ymm12[3],ymm11[3]
4568 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm27[1],ymm26[1],ymm27[3],ymm26[3]
4569 ; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
4570 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm22
4571 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12]
4572 ; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4573 ; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4574 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm1
4575 ; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm0, %zmm1
4576 ; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload
4577 ; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm2
4578 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm11
4579 ; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm0, %zmm2
4580 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
4581 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm0, %zmm9
4582 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm5[0],zmm16[0],zmm5[2],zmm16[2],zmm5[4],zmm16[4],zmm5[6],zmm16[6]
4583 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm2 {%k1}
4584 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm24
4585 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm1
4586 ; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm0, %zmm1
4587 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
4588 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm2
4589 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
4590 ; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm0, %zmm2
4591 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
4592 ; AVX512DQ-NEXT: vpermi2q %zmm30, %zmm28, %zmm0
4593 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm14[0],zmm10[0],zmm14[2],zmm10[2],zmm14[4],zmm10[4],zmm14[6],zmm10[6]
4594 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
4595 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm25
4596 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [5,13,5,13,5,13,5,13]
4597 ; AVX512DQ-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4598 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm0
4599 ; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm27, %zmm0
4600 ; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm1
4601 ; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm27, %zmm1
4602 ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm8
4603 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4604 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm5[1],zmm16[1],zmm5[3],zmm16[3],zmm5[5],zmm16[5],zmm5[7],zmm16[7]
4605 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [6,14,6,14,6,14,6,14]
4606 ; AVX512DQ-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4607 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm1
4608 ; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm26, %zmm1
4609 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [7,15,7,15,7,15,7,15]
4610 ; AVX512DQ-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4611 ; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm2, %zmm5
4612 ; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm12
4613 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k1} = zmm15[0],zmm3[0],zmm15[2],zmm3[2],zmm15[4],zmm3[4],zmm15[6],zmm3[6]
4614 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm15[1],zmm3[1],zmm15[3],zmm3[3],zmm15[5],zmm3[5],zmm15[7],zmm3[7]
4615 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm21, %zmm12
4616 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm27, %zmm15
4617 ; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm0 {%k1}
4618 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm17
4619 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
4620 ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm0
4621 ; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm27, %zmm0
4622 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm15
4623 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm9
4624 ; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm27, %zmm9
4625 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
4626 ; AVX512DQ-NEXT: vpermi2q %zmm30, %zmm28, %zmm27
4627 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm14[1],zmm10[1],zmm14[3],zmm10[3],zmm14[5],zmm10[5],zmm14[7],zmm10[7]
4628 ; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm9 {%k1}
4629 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm27
4630 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
4631 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm0
4632 ; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm26, %zmm0
4633 ; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm9
4634 ; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm26, %zmm9
4635 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
4636 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm16
4637 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
4638 ; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm2, %zmm4
4639 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4640 ; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm1
4641 ; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm29, %zmm1
4642 ; AVX512DQ-NEXT: vpermi2q %zmm10, %zmm14, %zmm29
4643 ; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm2, %zmm13
4644 ; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm4
4645 ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm9
4646 ; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm26, %zmm9
4647 ; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm2, %zmm11
4648 ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm8
4649 ; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm13
4650 ; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm11
4651 ; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm26, %zmm11
4652 ; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm2, %zmm13
4653 ; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm15
4654 ; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm13
4655 ; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm21, %zmm13
4656 ; AVX512DQ-NEXT: vpermi2q %zmm10, %zmm14, %zmm21
4657 ; AVX512DQ-NEXT: vpermi2q %zmm10, %zmm14, %zmm26
4658 ; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm2, %zmm14
4659 ; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm2
4660 ; AVX512DQ-NEXT: vinserti128 $1, 192(%rdi), %ymm2, %ymm2
4661 ; AVX512DQ-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm0
4662 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
4663 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm10, %zmm18, %zmm10
4664 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm29 {%k1}
4665 ; AVX512DQ-NEXT: vmovdqa 576(%rdi), %xmm1
4666 ; AVX512DQ-NEXT: vinserti128 $1, 704(%rdi), %ymm1, %ymm1
4667 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm26 {%k1} = zmm28[0],zmm30[0],zmm28[2],zmm30[2],zmm28[4],zmm30[4],zmm28[6],zmm30[6]
4668 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm14 {%k1} = zmm28[1],zmm30[1],zmm28[3],zmm30[3],zmm28[5],zmm30[5],zmm28[7],zmm30[7]
4669 ; AVX512DQ-NEXT: vmovdqa 512(%rdi), %xmm6
4670 ; AVX512DQ-NEXT: vinserti128 $1, 640(%rdi), %ymm6, %ymm6
4671 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm1[0],ymm6[2],ymm1[2]
4672 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm7, %zmm29, %zmm7
4673 ; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm23 {%k1}
4674 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
4675 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0
4676 ; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm21 {%k1}
4677 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm1[1],ymm6[3],ymm1[3]
4678 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm21, %zmm1
4679 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm9[4,5,6,7]
4680 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm26, %zmm2
4681 ; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
4682 ; AVX512DQ-NEXT: # ymm4 = ymm4[0,1,2,3],mem[4,5,6,7]
4683 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4
4684 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3],ymm8[4,5,6,7]
4685 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm14, %zmm3
4686 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, 64(%rsi)
4687 ; AVX512DQ-NEXT: vmovdqa64 %zmm10, (%rsi)
4688 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, 64(%rdx)
4689 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx)
4690 ; AVX512DQ-NEXT: vmovdqa64 %zmm19, 64(%rcx)
4691 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
4692 ; AVX512DQ-NEXT: vmovaps %zmm0, (%rcx)
4693 ; AVX512DQ-NEXT: vmovdqa64 %zmm22, 64(%r8)
4694 ; AVX512DQ-NEXT: vmovdqa64 %zmm20, (%r8)
4695 ; AVX512DQ-NEXT: vmovdqa64 %zmm25, 64(%r9)
4696 ; AVX512DQ-NEXT: vmovdqa64 %zmm24, (%r9)
4697 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
4698 ; AVX512DQ-NEXT: vmovdqa64 %zmm27, 64(%rax)
4699 ; AVX512DQ-NEXT: vmovdqa64 %zmm17, (%rax)
4700 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
4701 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, 64(%rax)
4702 ; AVX512DQ-NEXT: vmovdqa64 %zmm16, (%rax)
4703 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
4704 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, 64(%rax)
4705 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, (%rax)
4706 ; AVX512DQ-NEXT: addq $264, %rsp # imm = 0x108
4707 ; AVX512DQ-NEXT: vzeroupper
4708 ; AVX512DQ-NEXT: retq
4710 ; AVX512DQ-FCP-LABEL: load_i64_stride8_vf16:
4711 ; AVX512DQ-FCP: # %bb.0:
4712 ; AVX512DQ-FCP-NEXT: subq $264, %rsp # imm = 0x108
4713 ; AVX512DQ-FCP-NEXT: vmovaps 576(%rdi), %zmm0
4714 ; AVX512DQ-FCP-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4715 ; AVX512DQ-FCP-NEXT: vmovaps 512(%rdi), %zmm0
4716 ; AVX512DQ-FCP-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4717 ; AVX512DQ-FCP-NEXT: vmovdqa64 704(%rdi), %zmm6
4718 ; AVX512DQ-FCP-NEXT: vmovdqa64 640(%rdi), %zmm4
4719 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4720 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm7
4721 ; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %zmm0
4722 ; AVX512DQ-FCP-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill
4723 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm31
4724 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm8
4725 ; AVX512DQ-FCP-NEXT: vmovdqa64 832(%rdi), %zmm10
4726 ; AVX512DQ-FCP-NEXT: vmovdqa64 768(%rdi), %zmm14
4727 ; AVX512DQ-FCP-NEXT: vmovdqa64 960(%rdi), %zmm30
4728 ; AVX512DQ-FCP-NEXT: vmovdqa64 896(%rdi), %zmm28
4729 ; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm16
4730 ; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm5
4731 ; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm3
4732 ; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm15
4733 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [0,8,0,8,0,8,0,8]
4734 ; AVX512DQ-FCP-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4735 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm17
4736 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm20
4737 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm9
4738 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm19
4739 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm29, %zmm19
4740 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm18
4741 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm29, %zmm18
4742 ; AVX512DQ-FCP-NEXT: movb $-64, %al
4743 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1
4744 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm18 {%k1}
4745 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [2,10,2,10,2,10,2,10]
4746 ; AVX512DQ-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4747 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm19, %zmm17
4748 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm21
4749 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm19, %zmm21
4750 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm21 {%k1}
4751 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm22
4752 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %ymm23
4753 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm23[0],ymm22[0],ymm23[2],ymm22[2]
4754 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %ymm24
4755 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %ymm25
4756 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm25[0],ymm24[0],ymm25[2],ymm24[2]
4757 ; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3]
4758 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm21, %zmm0
4759 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4760 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm11
4761 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm19, %zmm11
4762 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm10, %zmm14, %zmm19
4763 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm19 {%k1}
4764 ; AVX512DQ-FCP-NEXT: vmovdqa 704(%rdi), %ymm11
4765 ; AVX512DQ-FCP-NEXT: vmovdqa 640(%rdi), %ymm12
4766 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm11[0],ymm12[2],ymm11[2]
4767 ; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %ymm26
4768 ; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %ymm27
4769 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm27[0],ymm26[0],ymm27[2],ymm26[2]
4770 ; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm13[2,3]
4771 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm19
4772 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11]
4773 ; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4774 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm20
4775 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm13
4776 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm13
4777 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm13 {%k1}
4778 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [1,9,1,9,1,9,1,9]
4779 ; AVX512DQ-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4780 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm22[1],ymm23[3],ymm22[3]
4781 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm23
4782 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm21, %zmm23
4783 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm25[1],ymm24[1],ymm25[3],ymm24[3]
4784 ; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
4785 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm20
4786 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm1
4787 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm0, %zmm1
4788 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm10, %zmm14, %zmm0
4789 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
4790 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm11[1],ymm12[3],ymm11[3]
4791 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm27[1],ymm26[1],ymm27[3],ymm26[3]
4792 ; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
4793 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm22
4794 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12]
4795 ; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4796 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4797 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm1
4798 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm1
4799 ; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload
4800 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm2
4801 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm11
4802 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm2
4803 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
4804 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm9
4805 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm5[0],zmm16[0],zmm5[2],zmm16[2],zmm5[4],zmm16[4],zmm5[6],zmm16[6]
4806 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 {%k1}
4807 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm24
4808 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm1
4809 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm1
4810 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
4811 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm2
4812 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
4813 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm2
4814 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
4815 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm30, %zmm28, %zmm0
4816 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm14[0],zmm10[0],zmm14[2],zmm10[2],zmm14[4],zmm10[4],zmm14[6],zmm10[6]
4817 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
4818 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm25
4819 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [5,13,5,13,5,13,5,13]
4820 ; AVX512DQ-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4821 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm0
4822 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm27, %zmm0
4823 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm1
4824 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm27, %zmm1
4825 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm8
4826 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4827 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm5[1],zmm16[1],zmm5[3],zmm16[3],zmm5[5],zmm16[5],zmm5[7],zmm16[7]
4828 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [6,14,6,14,6,14,6,14]
4829 ; AVX512DQ-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4830 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm1
4831 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm26, %zmm1
4832 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [7,15,7,15,7,15,7,15]
4833 ; AVX512DQ-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4834 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm2, %zmm5
4835 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm12
4836 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k1} = zmm15[0],zmm3[0],zmm15[2],zmm3[2],zmm15[4],zmm3[4],zmm15[6],zmm3[6]
4837 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm15[1],zmm3[1],zmm15[3],zmm3[3],zmm15[5],zmm3[5],zmm15[7],zmm3[7]
4838 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm21, %zmm12
4839 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm27, %zmm15
4840 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm0 {%k1}
4841 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm17
4842 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
4843 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm0
4844 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm27, %zmm0
4845 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm15
4846 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm9
4847 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm27, %zmm9
4848 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
4849 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm30, %zmm28, %zmm27
4850 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm14[1],zmm10[1],zmm14[3],zmm10[3],zmm14[5],zmm10[5],zmm14[7],zmm10[7]
4851 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm9 {%k1}
4852 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm27
4853 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
4854 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm0
4855 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm26, %zmm0
4856 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm9
4857 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm26, %zmm9
4858 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
4859 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm16
4860 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
4861 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm2, %zmm4
4862 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4863 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm1
4864 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm29, %zmm1
4865 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm10, %zmm14, %zmm29
4866 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm13
4867 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm4
4868 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm9
4869 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm26, %zmm9
4870 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm11
4871 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm8
4872 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm13
4873 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm11
4874 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm26, %zmm11
4875 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm2, %zmm13
4876 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm15
4877 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm13
4878 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm21, %zmm13
4879 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm10, %zmm14, %zmm21
4880 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm10, %zmm14, %zmm26
4881 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm2, %zmm14
4882 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm2
4883 ; AVX512DQ-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm2, %ymm2
4884 ; AVX512DQ-FCP-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm0
4885 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
4886 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm18, %zmm10
4887 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm29 {%k1}
4888 ; AVX512DQ-FCP-NEXT: vmovdqa 576(%rdi), %xmm1
4889 ; AVX512DQ-FCP-NEXT: vinserti128 $1, 704(%rdi), %ymm1, %ymm1
4890 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm26 {%k1} = zmm28[0],zmm30[0],zmm28[2],zmm30[2],zmm28[4],zmm30[4],zmm28[6],zmm30[6]
4891 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm14 {%k1} = zmm28[1],zmm30[1],zmm28[3],zmm30[3],zmm28[5],zmm30[5],zmm28[7],zmm30[7]
4892 ; AVX512DQ-FCP-NEXT: vmovdqa 512(%rdi), %xmm6
4893 ; AVX512DQ-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm6, %ymm6
4894 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm1[0],ymm6[2],ymm1[2]
4895 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm29, %zmm7
4896 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm23 {%k1}
4897 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
4898 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0
4899 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm21 {%k1}
4900 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm1[1],ymm6[3],ymm1[3]
4901 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm21, %zmm1
4902 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm9[4,5,6,7]
4903 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm26, %zmm2
4904 ; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
4905 ; AVX512DQ-FCP-NEXT: # ymm4 = ymm4[0,1,2,3],mem[4,5,6,7]
4906 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4
4907 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3],ymm8[4,5,6,7]
4908 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm14, %zmm3
4909 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 64(%rsi)
4910 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, (%rsi)
4911 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 64(%rdx)
4912 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%rdx)
4913 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, 64(%rcx)
4914 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
4915 ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, (%rcx)
4916 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, 64(%r8)
4917 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, (%r8)
4918 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, 64(%r9)
4919 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, (%r9)
4920 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
4921 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, 64(%rax)
4922 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, (%rax)
4923 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
4924 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 64(%rax)
4925 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, (%rax)
4926 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
4927 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 64(%rax)
4928 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, (%rax)
4929 ; AVX512DQ-FCP-NEXT: addq $264, %rsp # imm = 0x108
4930 ; AVX512DQ-FCP-NEXT: vzeroupper
4931 ; AVX512DQ-FCP-NEXT: retq
4933 ; AVX512BW-LABEL: load_i64_stride8_vf16:
4934 ; AVX512BW: # %bb.0:
4935 ; AVX512BW-NEXT: subq $264, %rsp # imm = 0x108
4936 ; AVX512BW-NEXT: vmovaps 576(%rdi), %zmm0
4937 ; AVX512BW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4938 ; AVX512BW-NEXT: vmovaps 512(%rdi), %zmm0
4939 ; AVX512BW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4940 ; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm6
4941 ; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm4
4942 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4943 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm7
4944 ; AVX512BW-NEXT: vmovaps (%rdi), %zmm0
4945 ; AVX512BW-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill
4946 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm31
4947 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm8
4948 ; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm10
4949 ; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm14
4950 ; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm30
4951 ; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm28
4952 ; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm16
4953 ; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm5
4954 ; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm3
4955 ; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm15
4956 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [0,8,0,8,0,8,0,8]
4957 ; AVX512BW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4958 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm17
4959 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm20
4960 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm9
4961 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm19
4962 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm29, %zmm19
4963 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm18
4964 ; AVX512BW-NEXT: vpermt2q %zmm16, %zmm29, %zmm18
4965 ; AVX512BW-NEXT: movb $-64, %al
4966 ; AVX512BW-NEXT: kmovd %eax, %k1
4967 ; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm18 {%k1}
4968 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [2,10,2,10,2,10,2,10]
4969 ; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4970 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm19, %zmm17
4971 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm21
4972 ; AVX512BW-NEXT: vpermt2q %zmm16, %zmm19, %zmm21
4973 ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm21 {%k1}
4974 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %ymm22
4975 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %ymm23
4976 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm23[0],ymm22[0],ymm23[2],ymm22[2]
4977 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm24
4978 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %ymm25
4979 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm25[0],ymm24[0],ymm25[2],ymm24[2]
4980 ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3]
4981 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm21, %zmm0
4982 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4983 ; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm11
4984 ; AVX512BW-NEXT: vpermt2q %zmm30, %zmm19, %zmm11
4985 ; AVX512BW-NEXT: vpermi2q %zmm10, %zmm14, %zmm19
4986 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm19 {%k1}
4987 ; AVX512BW-NEXT: vmovdqa 704(%rdi), %ymm11
4988 ; AVX512BW-NEXT: vmovdqa 640(%rdi), %ymm12
4989 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm11[0],ymm12[2],ymm11[2]
4990 ; AVX512BW-NEXT: vmovdqa64 576(%rdi), %ymm26
4991 ; AVX512BW-NEXT: vmovdqa64 512(%rdi), %ymm27
4992 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm27[0],ymm26[0],ymm27[2],ymm26[2]
4993 ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm13[2,3]
4994 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm19
4995 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11]
4996 ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4997 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm20
4998 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm13
4999 ; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm13
5000 ; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm13 {%k1}
5001 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [1,9,1,9,1,9,1,9]
5002 ; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
5003 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm22[1],ymm23[3],ymm22[3]
5004 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm23
5005 ; AVX512BW-NEXT: vpermt2q %zmm16, %zmm21, %zmm23
5006 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm25[1],ymm24[1],ymm25[3],ymm24[3]
5007 ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
5008 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm20
5009 ; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm1
5010 ; AVX512BW-NEXT: vpermt2q %zmm30, %zmm0, %zmm1
5011 ; AVX512BW-NEXT: vpermi2q %zmm10, %zmm14, %zmm0
5012 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
5013 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm11[1],ymm12[3],ymm11[3]
5014 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm27[1],ymm26[1],ymm27[3],ymm26[3]
5015 ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
5016 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm22
5017 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12]
5018 ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
5019 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5020 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1
5021 ; AVX512BW-NEXT: vpermt2q %zmm31, %zmm0, %zmm1
5022 ; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload
5023 ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm2
5024 ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm11
5025 ; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm2
5026 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
5027 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm9
5028 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm5[0],zmm16[0],zmm5[2],zmm16[2],zmm5[4],zmm16[4],zmm5[6],zmm16[6]
5029 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm2 {%k1}
5030 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm24
5031 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1
5032 ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm1
5033 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
5034 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm2
5035 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
5036 ; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm2
5037 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
5038 ; AVX512BW-NEXT: vpermi2q %zmm30, %zmm28, %zmm0
5039 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm14[0],zmm10[0],zmm14[2],zmm10[2],zmm14[4],zmm10[4],zmm14[6],zmm10[6]
5040 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
5041 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm25
5042 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [5,13,5,13,5,13,5,13]
5043 ; AVX512BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
5044 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm0
5045 ; AVX512BW-NEXT: vpermt2q %zmm31, %zmm27, %zmm0
5046 ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1
5047 ; AVX512BW-NEXT: vpermt2q %zmm11, %zmm27, %zmm1
5048 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm8
5049 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5050 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm5[1],zmm16[1],zmm5[3],zmm16[3],zmm5[5],zmm16[5],zmm5[7],zmm16[7]
5051 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [6,14,6,14,6,14,6,14]
5052 ; AVX512BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
5053 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1
5054 ; AVX512BW-NEXT: vpermt2q %zmm16, %zmm26, %zmm1
5055 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [7,15,7,15,7,15,7,15]
5056 ; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
5057 ; AVX512BW-NEXT: vpermt2q %zmm16, %zmm2, %zmm5
5058 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm12
5059 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k1} = zmm15[0],zmm3[0],zmm15[2],zmm3[2],zmm15[4],zmm3[4],zmm15[6],zmm3[6]
5060 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm15[1],zmm3[1],zmm15[3],zmm3[3],zmm15[5],zmm3[5],zmm15[7],zmm3[7]
5061 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm21, %zmm12
5062 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm27, %zmm15
5063 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm0 {%k1}
5064 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm17
5065 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
5066 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0
5067 ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm27, %zmm0
5068 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm15
5069 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm9
5070 ; AVX512BW-NEXT: vpermt2q %zmm7, %zmm27, %zmm9
5071 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
5072 ; AVX512BW-NEXT: vpermi2q %zmm30, %zmm28, %zmm27
5073 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm14[1],zmm10[1],zmm14[3],zmm10[3],zmm14[5],zmm10[5],zmm14[7],zmm10[7]
5074 ; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm9 {%k1}
5075 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm27
5076 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
5077 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0
5078 ; AVX512BW-NEXT: vpermt2q %zmm31, %zmm26, %zmm0
5079 ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm9
5080 ; AVX512BW-NEXT: vpermt2q %zmm8, %zmm26, %zmm9
5081 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
5082 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm16
5083 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
5084 ; AVX512BW-NEXT: vpermt2q %zmm31, %zmm2, %zmm4
5085 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5086 ; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm1
5087 ; AVX512BW-NEXT: vpermt2q %zmm30, %zmm29, %zmm1
5088 ; AVX512BW-NEXT: vpermi2q %zmm10, %zmm14, %zmm29
5089 ; AVX512BW-NEXT: vpermt2q %zmm8, %zmm2, %zmm13
5090 ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm4
5091 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm9
5092 ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm26, %zmm9
5093 ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm2, %zmm11
5094 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm8
5095 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm13
5096 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm11
5097 ; AVX512BW-NEXT: vpermt2q %zmm7, %zmm26, %zmm11
5098 ; AVX512BW-NEXT: vpermt2q %zmm7, %zmm2, %zmm13
5099 ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm15
5100 ; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm13
5101 ; AVX512BW-NEXT: vpermt2q %zmm30, %zmm21, %zmm13
5102 ; AVX512BW-NEXT: vpermi2q %zmm10, %zmm14, %zmm21
5103 ; AVX512BW-NEXT: vpermi2q %zmm10, %zmm14, %zmm26
5104 ; AVX512BW-NEXT: vpermt2q %zmm10, %zmm2, %zmm14
5105 ; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm2
5106 ; AVX512BW-NEXT: vinserti128 $1, 192(%rdi), %ymm2, %ymm2
5107 ; AVX512BW-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm0
5108 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
5109 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm18, %zmm10
5110 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm29 {%k1}
5111 ; AVX512BW-NEXT: vmovdqa 576(%rdi), %xmm1
5112 ; AVX512BW-NEXT: vinserti128 $1, 704(%rdi), %ymm1, %ymm1
5113 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm26 {%k1} = zmm28[0],zmm30[0],zmm28[2],zmm30[2],zmm28[4],zmm30[4],zmm28[6],zmm30[6]
5114 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm14 {%k1} = zmm28[1],zmm30[1],zmm28[3],zmm30[3],zmm28[5],zmm30[5],zmm28[7],zmm30[7]
5115 ; AVX512BW-NEXT: vmovdqa 512(%rdi), %xmm6
5116 ; AVX512BW-NEXT: vinserti128 $1, 640(%rdi), %ymm6, %ymm6
5117 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm1[0],ymm6[2],ymm1[2]
5118 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm29, %zmm7
5119 ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm23 {%k1}
5120 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
5121 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0
5122 ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm21 {%k1}
5123 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm1[1],ymm6[3],ymm1[3]
5124 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm21, %zmm1
5125 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm9[4,5,6,7]
5126 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm26, %zmm2
5127 ; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
5128 ; AVX512BW-NEXT: # ymm4 = ymm4[0,1,2,3],mem[4,5,6,7]
5129 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4
5130 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3],ymm8[4,5,6,7]
5131 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm14, %zmm3
5132 ; AVX512BW-NEXT: vmovdqa64 %zmm7, 64(%rsi)
5133 ; AVX512BW-NEXT: vmovdqa64 %zmm10, (%rsi)
5134 ; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rdx)
5135 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
5136 ; AVX512BW-NEXT: vmovdqa64 %zmm19, 64(%rcx)
5137 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
5138 ; AVX512BW-NEXT: vmovaps %zmm0, (%rcx)
5139 ; AVX512BW-NEXT: vmovdqa64 %zmm22, 64(%r8)
5140 ; AVX512BW-NEXT: vmovdqa64 %zmm20, (%r8)
5141 ; AVX512BW-NEXT: vmovdqa64 %zmm25, 64(%r9)
5142 ; AVX512BW-NEXT: vmovdqa64 %zmm24, (%r9)
5143 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
5144 ; AVX512BW-NEXT: vmovdqa64 %zmm27, 64(%rax)
5145 ; AVX512BW-NEXT: vmovdqa64 %zmm17, (%rax)
5146 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
5147 ; AVX512BW-NEXT: vmovdqa64 %zmm2, 64(%rax)
5148 ; AVX512BW-NEXT: vmovdqa64 %zmm16, (%rax)
5149 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
5150 ; AVX512BW-NEXT: vmovdqa64 %zmm3, 64(%rax)
5151 ; AVX512BW-NEXT: vmovdqa64 %zmm4, (%rax)
5152 ; AVX512BW-NEXT: addq $264, %rsp # imm = 0x108
5153 ; AVX512BW-NEXT: vzeroupper
5154 ; AVX512BW-NEXT: retq
5156 ; AVX512BW-FCP-LABEL: load_i64_stride8_vf16:
5157 ; AVX512BW-FCP: # %bb.0:
5158 ; AVX512BW-FCP-NEXT: subq $264, %rsp # imm = 0x108
5159 ; AVX512BW-FCP-NEXT: vmovaps 576(%rdi), %zmm0
5160 ; AVX512BW-FCP-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5161 ; AVX512BW-FCP-NEXT: vmovaps 512(%rdi), %zmm0
5162 ; AVX512BW-FCP-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5163 ; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm6
5164 ; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm4
5165 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5166 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm7
5167 ; AVX512BW-FCP-NEXT: vmovaps (%rdi), %zmm0
5168 ; AVX512BW-FCP-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill
5169 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm31
5170 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm8
5171 ; AVX512BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm10
5172 ; AVX512BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm14
5173 ; AVX512BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm30
5174 ; AVX512BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm28
5175 ; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm16
5176 ; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm5
5177 ; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm3
5178 ; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm15
5179 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [0,8,0,8,0,8,0,8]
5180 ; AVX512BW-FCP-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
5181 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm17
5182 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm20
5183 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm9
5184 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm19
5185 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm29, %zmm19
5186 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm18
5187 ; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm29, %zmm18
5188 ; AVX512BW-FCP-NEXT: movb $-64, %al
5189 ; AVX512BW-FCP-NEXT: kmovd %eax, %k1
5190 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm18 {%k1}
5191 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [2,10,2,10,2,10,2,10]
5192 ; AVX512BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
5193 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm19, %zmm17
5194 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm21
5195 ; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm19, %zmm21
5196 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm21 {%k1}
5197 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %ymm22
5198 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %ymm23
5199 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm23[0],ymm22[0],ymm23[2],ymm22[2]
5200 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %ymm24
5201 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %ymm25
5202 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm25[0],ymm24[0],ymm25[2],ymm24[2]
5203 ; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3]
5204 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm21, %zmm0
5205 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5206 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm11
5207 ; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm19, %zmm11
5208 ; AVX512BW-FCP-NEXT: vpermi2q %zmm10, %zmm14, %zmm19
5209 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm19 {%k1}
5210 ; AVX512BW-FCP-NEXT: vmovdqa 704(%rdi), %ymm11
5211 ; AVX512BW-FCP-NEXT: vmovdqa 640(%rdi), %ymm12
5212 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm11[0],ymm12[2],ymm11[2]
5213 ; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %ymm26
5214 ; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %ymm27
5215 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm27[0],ymm26[0],ymm27[2],ymm26[2]
5216 ; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm13[2,3]
5217 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm19
5218 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11]
5219 ; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
5220 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm20
5221 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm13
5222 ; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm13
5223 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm13 {%k1}
5224 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [1,9,1,9,1,9,1,9]
5225 ; AVX512BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
5226 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm22[1],ymm23[3],ymm22[3]
5227 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm23
5228 ; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm21, %zmm23
5229 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm25[1],ymm24[1],ymm25[3],ymm24[3]
5230 ; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
5231 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm20
5232 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm1
5233 ; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm0, %zmm1
5234 ; AVX512BW-FCP-NEXT: vpermi2q %zmm10, %zmm14, %zmm0
5235 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
5236 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm11[1],ymm12[3],ymm11[3]
5237 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm27[1],ymm26[1],ymm27[3],ymm26[3]
5238 ; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
5239 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm22
5240 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12]
5241 ; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
5242 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5243 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1
5244 ; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm1
5245 ; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload
5246 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm2
5247 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm11
5248 ; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm2
5249 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
5250 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm9
5251 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm5[0],zmm16[0],zmm5[2],zmm16[2],zmm5[4],zmm16[4],zmm5[6],zmm16[6]
5252 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 {%k1}
5253 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm24
5254 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm1
5255 ; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm1
5256 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
5257 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm2
5258 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
5259 ; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm2
5260 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
5261 ; AVX512BW-FCP-NEXT: vpermi2q %zmm30, %zmm28, %zmm0
5262 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm14[0],zmm10[0],zmm14[2],zmm10[2],zmm14[4],zmm10[4],zmm14[6],zmm10[6]
5263 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
5264 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm25
5265 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [5,13,5,13,5,13,5,13]
5266 ; AVX512BW-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
5267 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm0
5268 ; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm27, %zmm0
5269 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm1
5270 ; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm27, %zmm1
5271 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm8
5272 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5273 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm5[1],zmm16[1],zmm5[3],zmm16[3],zmm5[5],zmm16[5],zmm5[7],zmm16[7]
5274 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [6,14,6,14,6,14,6,14]
5275 ; AVX512BW-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
5276 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1
5277 ; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm26, %zmm1
5278 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [7,15,7,15,7,15,7,15]
5279 ; AVX512BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
5280 ; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm2, %zmm5
5281 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm12
5282 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k1} = zmm15[0],zmm3[0],zmm15[2],zmm3[2],zmm15[4],zmm3[4],zmm15[6],zmm3[6]
5283 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm15[1],zmm3[1],zmm15[3],zmm3[3],zmm15[5],zmm3[5],zmm15[7],zmm3[7]
5284 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm21, %zmm12
5285 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm27, %zmm15
5286 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm0 {%k1}
5287 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm17
5288 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
5289 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm0
5290 ; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm27, %zmm0
5291 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm15
5292 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm9
5293 ; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm27, %zmm9
5294 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
5295 ; AVX512BW-FCP-NEXT: vpermi2q %zmm30, %zmm28, %zmm27
5296 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm14[1],zmm10[1],zmm14[3],zmm10[3],zmm14[5],zmm10[5],zmm14[7],zmm10[7]
5297 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm9 {%k1}
5298 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm27
5299 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
5300 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm0
5301 ; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm26, %zmm0
5302 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm9
5303 ; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm26, %zmm9
5304 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
5305 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm16
5306 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
5307 ; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm2, %zmm4
5308 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5309 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm1
5310 ; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm29, %zmm1
5311 ; AVX512BW-FCP-NEXT: vpermi2q %zmm10, %zmm14, %zmm29
5312 ; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm13
5313 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm4
5314 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm9
5315 ; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm26, %zmm9
5316 ; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm11
5317 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm8
5318 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm13
5319 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm11
5320 ; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm26, %zmm11
5321 ; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm2, %zmm13
5322 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm15
5323 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm13
5324 ; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm21, %zmm13
5325 ; AVX512BW-FCP-NEXT: vpermi2q %zmm10, %zmm14, %zmm21
5326 ; AVX512BW-FCP-NEXT: vpermi2q %zmm10, %zmm14, %zmm26
5327 ; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm2, %zmm14
5328 ; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm2
5329 ; AVX512BW-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm2, %ymm2
5330 ; AVX512BW-FCP-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm0
5331 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
5332 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm18, %zmm10
5333 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm29 {%k1}
5334 ; AVX512BW-FCP-NEXT: vmovdqa 576(%rdi), %xmm1
5335 ; AVX512BW-FCP-NEXT: vinserti128 $1, 704(%rdi), %ymm1, %ymm1
5336 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm26 {%k1} = zmm28[0],zmm30[0],zmm28[2],zmm30[2],zmm28[4],zmm30[4],zmm28[6],zmm30[6]
5337 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm14 {%k1} = zmm28[1],zmm30[1],zmm28[3],zmm30[3],zmm28[5],zmm30[5],zmm28[7],zmm30[7]
5338 ; AVX512BW-FCP-NEXT: vmovdqa 512(%rdi), %xmm6
5339 ; AVX512BW-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm6, %ymm6
5340 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm1[0],ymm6[2],ymm1[2]
5341 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm29, %zmm7
5342 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm23 {%k1}
5343 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
5344 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0
5345 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm21 {%k1}
5346 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm1[1],ymm6[3],ymm1[3]
5347 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm21, %zmm1
5348 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm9[4,5,6,7]
5349 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm26, %zmm2
5350 ; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
5351 ; AVX512BW-FCP-NEXT: # ymm4 = ymm4[0,1,2,3],mem[4,5,6,7]
5352 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4
5353 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3],ymm8[4,5,6,7]
5354 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm14, %zmm3
5355 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, 64(%rsi)
5356 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, (%rsi)
5357 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 64(%rdx)
5358 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%rdx)
5359 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, 64(%rcx)
5360 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
5361 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, (%rcx)
5362 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, 64(%r8)
5363 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, (%r8)
5364 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, 64(%r9)
5365 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, (%r9)
5366 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
5367 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, 64(%rax)
5368 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, (%rax)
5369 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
5370 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 64(%rax)
5371 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, (%rax)
5372 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
5373 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 64(%rax)
5374 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, (%rax)
5375 ; AVX512BW-FCP-NEXT: addq $264, %rsp # imm = 0x108
5376 ; AVX512BW-FCP-NEXT: vzeroupper
5377 ; AVX512BW-FCP-NEXT: retq
5379 ; AVX512DQ-BW-LABEL: load_i64_stride8_vf16:
5380 ; AVX512DQ-BW: # %bb.0:
5381 ; AVX512DQ-BW-NEXT: subq $264, %rsp # imm = 0x108
5382 ; AVX512DQ-BW-NEXT: vmovaps 576(%rdi), %zmm0
5383 ; AVX512DQ-BW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5384 ; AVX512DQ-BW-NEXT: vmovaps 512(%rdi), %zmm0
5385 ; AVX512DQ-BW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5386 ; AVX512DQ-BW-NEXT: vmovdqa64 704(%rdi), %zmm6
5387 ; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %zmm4
5388 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5389 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm7
5390 ; AVX512DQ-BW-NEXT: vmovaps (%rdi), %zmm0
5391 ; AVX512DQ-BW-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill
5392 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm31
5393 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm8
5394 ; AVX512DQ-BW-NEXT: vmovdqa64 832(%rdi), %zmm10
5395 ; AVX512DQ-BW-NEXT: vmovdqa64 768(%rdi), %zmm14
5396 ; AVX512DQ-BW-NEXT: vmovdqa64 960(%rdi), %zmm30
5397 ; AVX512DQ-BW-NEXT: vmovdqa64 896(%rdi), %zmm28
5398 ; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm16
5399 ; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm5
5400 ; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm3
5401 ; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm15
5402 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [0,8,0,8,0,8,0,8]
5403 ; AVX512DQ-BW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
5404 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm17
5405 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm20
5406 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm9
5407 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm19
5408 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm29, %zmm19
5409 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm18
5410 ; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm29, %zmm18
5411 ; AVX512DQ-BW-NEXT: movb $-64, %al
5412 ; AVX512DQ-BW-NEXT: kmovd %eax, %k1
5413 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm18 {%k1}
5414 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [2,10,2,10,2,10,2,10]
5415 ; AVX512DQ-BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
5416 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm19, %zmm17
5417 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm21
5418 ; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm19, %zmm21
5419 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm21 {%k1}
5420 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %ymm22
5421 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %ymm23
5422 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm23[0],ymm22[0],ymm23[2],ymm22[2]
5423 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %ymm24
5424 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %ymm25
5425 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm25[0],ymm24[0],ymm25[2],ymm24[2]
5426 ; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3]
5427 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm11, %zmm21, %zmm0
5428 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5429 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm11
5430 ; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm19, %zmm11
5431 ; AVX512DQ-BW-NEXT: vpermi2q %zmm10, %zmm14, %zmm19
5432 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm19 {%k1}
5433 ; AVX512DQ-BW-NEXT: vmovdqa 704(%rdi), %ymm11
5434 ; AVX512DQ-BW-NEXT: vmovdqa 640(%rdi), %ymm12
5435 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm11[0],ymm12[2],ymm11[2]
5436 ; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %ymm26
5437 ; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %ymm27
5438 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm27[0],ymm26[0],ymm27[2],ymm26[2]
5439 ; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm13[2,3]
5440 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm19
5441 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11]
5442 ; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
5443 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm20
5444 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm13
5445 ; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm13
5446 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm13 {%k1}
5447 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [1,9,1,9,1,9,1,9]
5448 ; AVX512DQ-BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
5449 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm22[1],ymm23[3],ymm22[3]
5450 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm23
5451 ; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm21, %zmm23
5452 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm25[1],ymm24[1],ymm25[3],ymm24[3]
5453 ; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
5454 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm20
5455 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm1
5456 ; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm0, %zmm1
5457 ; AVX512DQ-BW-NEXT: vpermi2q %zmm10, %zmm14, %zmm0
5458 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
5459 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm11[1],ymm12[3],ymm11[3]
5460 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm27[1],ymm26[1],ymm27[3],ymm26[3]
5461 ; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
5462 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm22
5463 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12]
5464 ; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
5465 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5466 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm1
5467 ; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm0, %zmm1
5468 ; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload
5469 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm2
5470 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm11
5471 ; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm2
5472 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
5473 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm9
5474 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm5[0],zmm16[0],zmm5[2],zmm16[2],zmm5[4],zmm16[4],zmm5[6],zmm16[6]
5475 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm2 {%k1}
5476 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm24
5477 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm1
5478 ; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm1
5479 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
5480 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm2
5481 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
5482 ; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm2
5483 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
5484 ; AVX512DQ-BW-NEXT: vpermi2q %zmm30, %zmm28, %zmm0
5485 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm14[0],zmm10[0],zmm14[2],zmm10[2],zmm14[4],zmm10[4],zmm14[6],zmm10[6]
5486 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
5487 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm25
5488 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [5,13,5,13,5,13,5,13]
5489 ; AVX512DQ-BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
5490 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm0
5491 ; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm27, %zmm0
5492 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm1
5493 ; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm27, %zmm1
5494 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm8
5495 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5496 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm5[1],zmm16[1],zmm5[3],zmm16[3],zmm5[5],zmm16[5],zmm5[7],zmm16[7]
5497 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [6,14,6,14,6,14,6,14]
5498 ; AVX512DQ-BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
5499 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm1
5500 ; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm26, %zmm1
5501 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [7,15,7,15,7,15,7,15]
5502 ; AVX512DQ-BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
5503 ; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm2, %zmm5
5504 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm12
5505 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k1} = zmm15[0],zmm3[0],zmm15[2],zmm3[2],zmm15[4],zmm3[4],zmm15[6],zmm3[6]
5506 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm15[1],zmm3[1],zmm15[3],zmm3[3],zmm15[5],zmm3[5],zmm15[7],zmm3[7]
5507 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm21, %zmm12
5508 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm27, %zmm15
5509 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm0 {%k1}
5510 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm17
5511 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
5512 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm0
5513 ; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm27, %zmm0
5514 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm15
5515 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm9
5516 ; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm27, %zmm9
5517 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
5518 ; AVX512DQ-BW-NEXT: vpermi2q %zmm30, %zmm28, %zmm27
5519 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm14[1],zmm10[1],zmm14[3],zmm10[3],zmm14[5],zmm10[5],zmm14[7],zmm10[7]
5520 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm9 {%k1}
5521 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm27
5522 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
5523 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm0
5524 ; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm26, %zmm0
5525 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm9
5526 ; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm26, %zmm9
5527 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
5528 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm16
5529 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0
5530 ; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm2, %zmm4
5531 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5532 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm1
5533 ; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm29, %zmm1
5534 ; AVX512DQ-BW-NEXT: vpermi2q %zmm10, %zmm14, %zmm29
5535 ; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm2, %zmm13
5536 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm4
5537 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm9
5538 ; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm26, %zmm9
5539 ; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm2, %zmm11
5540 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm8
5541 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm13
5542 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm11
5543 ; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm26, %zmm11
5544 ; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm2, %zmm13
5545 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm15
5546 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm13
5547 ; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm21, %zmm13
5548 ; AVX512DQ-BW-NEXT: vpermi2q %zmm10, %zmm14, %zmm21
5549 ; AVX512DQ-BW-NEXT: vpermi2q %zmm10, %zmm14, %zmm26
5550 ; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm2, %zmm14
5551 ; AVX512DQ-BW-NEXT: vmovdqa 64(%rdi), %xmm2
5552 ; AVX512DQ-BW-NEXT: vinserti128 $1, 192(%rdi), %ymm2, %ymm2
5553 ; AVX512DQ-BW-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm0
5554 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
5555 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm10, %zmm18, %zmm10
5556 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm29 {%k1}
5557 ; AVX512DQ-BW-NEXT: vmovdqa 576(%rdi), %xmm1
5558 ; AVX512DQ-BW-NEXT: vinserti128 $1, 704(%rdi), %ymm1, %ymm1
5559 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm26 {%k1} = zmm28[0],zmm30[0],zmm28[2],zmm30[2],zmm28[4],zmm30[4],zmm28[6],zmm30[6]
5560 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm14 {%k1} = zmm28[1],zmm30[1],zmm28[3],zmm30[3],zmm28[5],zmm30[5],zmm28[7],zmm30[7]
5561 ; AVX512DQ-BW-NEXT: vmovdqa 512(%rdi), %xmm6
5562 ; AVX512DQ-BW-NEXT: vinserti128 $1, 640(%rdi), %ymm6, %ymm6
5563 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm1[0],ymm6[2],ymm1[2]
5564 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm7, %zmm29, %zmm7
5565 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm23 {%k1}
5566 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
5567 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0
5568 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm21 {%k1}
5569 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm1[1],ymm6[3],ymm1[3]
5570 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm21, %zmm1
5571 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm9[4,5,6,7]
5572 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm26, %zmm2
5573 ; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
5574 ; AVX512DQ-BW-NEXT: # ymm4 = ymm4[0,1,2,3],mem[4,5,6,7]
5575 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4
5576 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3],ymm8[4,5,6,7]
5577 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm14, %zmm3
5578 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, 64(%rsi)
5579 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, (%rsi)
5580 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 64(%rdx)
5581 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%rdx)
5582 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, 64(%rcx)
5583 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
5584 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, (%rcx)
5585 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, 64(%r8)
5586 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, (%r8)
5587 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, 64(%r9)
5588 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, (%r9)
5589 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
5590 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, 64(%rax)
5591 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, (%rax)
5592 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
5593 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 64(%rax)
5594 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, (%rax)
5595 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
5596 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 64(%rax)
5597 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, (%rax)
5598 ; AVX512DQ-BW-NEXT: addq $264, %rsp # imm = 0x108
5599 ; AVX512DQ-BW-NEXT: vzeroupper
5600 ; AVX512DQ-BW-NEXT: retq
5602 ; AVX512DQ-BW-FCP-LABEL: load_i64_stride8_vf16:
5603 ; AVX512DQ-BW-FCP: # %bb.0:
5604 ; AVX512DQ-BW-FCP-NEXT: subq $264, %rsp # imm = 0x108
5605 ; AVX512DQ-BW-FCP-NEXT: vmovaps 576(%rdi), %zmm0
5606 ; AVX512DQ-BW-FCP-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5607 ; AVX512DQ-BW-FCP-NEXT: vmovaps 512(%rdi), %zmm0
5608 ; AVX512DQ-BW-FCP-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5609 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm6
5610 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm4
5611 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5612 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm7
5613 ; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %zmm0
5614 ; AVX512DQ-BW-FCP-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill
5615 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm31
5616 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm8
5617 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm10
5618 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm14
5619 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm30
5620 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm28
5621 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm16
5622 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm5
5623 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm3
5624 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm15
5625 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [0,8,0,8,0,8,0,8]
5626 ; AVX512DQ-BW-FCP-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
5627 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm17
5628 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm20
5629 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm9
5630 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm19
5631 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm29, %zmm19
5632 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm18
5633 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm29, %zmm18
5634 ; AVX512DQ-BW-FCP-NEXT: movb $-64, %al
5635 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1
5636 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm18 {%k1}
5637 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [2,10,2,10,2,10,2,10]
5638 ; AVX512DQ-BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
5639 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm19, %zmm17
5640 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm21
5641 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm19, %zmm21
5642 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm21 {%k1}
5643 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %ymm22
5644 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %ymm23
5645 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm23[0],ymm22[0],ymm23[2],ymm22[2]
5646 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %ymm24
5647 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %ymm25
5648 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm25[0],ymm24[0],ymm25[2],ymm24[2]
5649 ; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3]
5650 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm21, %zmm0
5651 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5652 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm11
5653 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm19, %zmm11
5654 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm10, %zmm14, %zmm19
5655 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm19 {%k1}
5656 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 704(%rdi), %ymm11
5657 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 640(%rdi), %ymm12
5658 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm11[0],ymm12[2],ymm11[2]
5659 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %ymm26
5660 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %ymm27
5661 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm27[0],ymm26[0],ymm27[2],ymm26[2]
5662 ; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm13[2,3]
5663 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm19
5664 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11]
5665 ; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
5666 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm20
5667 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm13
5668 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm13
5669 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm13 {%k1}
5670 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [1,9,1,9,1,9,1,9]
5671 ; AVX512DQ-BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
5672 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm22[1],ymm23[3],ymm22[3]
5673 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm23
5674 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm21, %zmm23
5675 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm25[1],ymm24[1],ymm25[3],ymm24[3]
5676 ; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
5677 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm20
5678 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm1
5679 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm0, %zmm1
5680 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm10, %zmm14, %zmm0
5681 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
5682 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm11[1],ymm12[3],ymm11[3]
5683 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm27[1],ymm26[1],ymm27[3],ymm26[3]
5684 ; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
5685 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm22
5686 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12]
5687 ; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
5688 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5689 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1
5690 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm1
5691 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload
5692 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm2
5693 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm11
5694 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm2
5695 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
5696 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm9
5697 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm5[0],zmm16[0],zmm5[2],zmm16[2],zmm5[4],zmm16[4],zmm5[6],zmm16[6]
5698 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 {%k1}
5699 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm24
5700 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm1
5701 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm1
5702 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
5703 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm2
5704 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
5705 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm2
5706 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
5707 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm30, %zmm28, %zmm0
5708 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm14[0],zmm10[0],zmm14[2],zmm10[2],zmm14[4],zmm10[4],zmm14[6],zmm10[6]
5709 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
5710 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm25
5711 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [5,13,5,13,5,13,5,13]
5712 ; AVX512DQ-BW-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
5713 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm0
5714 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm27, %zmm0
5715 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm1
5716 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm27, %zmm1
5717 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm8
5718 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5719 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm5[1],zmm16[1],zmm5[3],zmm16[3],zmm5[5],zmm16[5],zmm5[7],zmm16[7]
5720 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [6,14,6,14,6,14,6,14]
5721 ; AVX512DQ-BW-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
5722 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1
5723 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm26, %zmm1
5724 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [7,15,7,15,7,15,7,15]
5725 ; AVX512DQ-BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
5726 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm2, %zmm5
5727 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm12
5728 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k1} = zmm15[0],zmm3[0],zmm15[2],zmm3[2],zmm15[4],zmm3[4],zmm15[6],zmm3[6]
5729 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm15[1],zmm3[1],zmm15[3],zmm3[3],zmm15[5],zmm3[5],zmm15[7],zmm3[7]
5730 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm21, %zmm12
5731 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm27, %zmm15
5732 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm0 {%k1}
5733 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm17
5734 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
5735 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm0
5736 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm27, %zmm0
5737 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm15
5738 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm9
5739 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm27, %zmm9
5740 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
5741 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm30, %zmm28, %zmm27
5742 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm14[1],zmm10[1],zmm14[3],zmm10[3],zmm14[5],zmm10[5],zmm14[7],zmm10[7]
5743 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm9 {%k1}
5744 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm27
5745 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
5746 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm0
5747 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm26, %zmm0
5748 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm9
5749 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm26, %zmm9
5750 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
5751 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm16
5752 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
5753 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm2, %zmm4
5754 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5755 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm1
5756 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm29, %zmm1
5757 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm10, %zmm14, %zmm29
5758 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm13
5759 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm4
5760 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm9
5761 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm26, %zmm9
5762 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm11
5763 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm8
5764 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm13
5765 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm11
5766 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm26, %zmm11
5767 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm2, %zmm13
5768 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm15
5769 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm13
5770 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm21, %zmm13
5771 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm10, %zmm14, %zmm21
5772 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm10, %zmm14, %zmm26
5773 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm2, %zmm14
5774 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm2
5775 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm2, %ymm2
5776 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm0
5777 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
5778 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm18, %zmm10
5779 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm29 {%k1}
5780 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 576(%rdi), %xmm1
5781 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 704(%rdi), %ymm1, %ymm1
5782 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm26 {%k1} = zmm28[0],zmm30[0],zmm28[2],zmm30[2],zmm28[4],zmm30[4],zmm28[6],zmm30[6]
5783 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm14 {%k1} = zmm28[1],zmm30[1],zmm28[3],zmm30[3],zmm28[5],zmm30[5],zmm28[7],zmm30[7]
5784 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 512(%rdi), %xmm6
5785 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm6, %ymm6
5786 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm1[0],ymm6[2],ymm1[2]
5787 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm29, %zmm7
5788 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm23 {%k1}
5789 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
5790 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0
5791 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm21 {%k1}
5792 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm1[1],ymm6[3],ymm1[3]
5793 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm21, %zmm1
5794 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm9[4,5,6,7]
5795 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm26, %zmm2
5796 ; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
5797 ; AVX512DQ-BW-FCP-NEXT: # ymm4 = ymm4[0,1,2,3],mem[4,5,6,7]
5798 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4
5799 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3],ymm8[4,5,6,7]
5800 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm14, %zmm3
5801 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, 64(%rsi)
5802 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, (%rsi)
5803 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 64(%rdx)
5804 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%rdx)
5805 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, 64(%rcx)
5806 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
5807 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, (%rcx)
5808 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, 64(%r8)
5809 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, (%r8)
5810 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, 64(%r9)
5811 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, (%r9)
5812 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
5813 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, 64(%rax)
5814 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, (%rax)
5815 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
5816 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 64(%rax)
5817 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, (%rax)
5818 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
5819 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 64(%rax)
5820 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, (%rax)
5821 ; AVX512DQ-BW-FCP-NEXT: addq $264, %rsp # imm = 0x108
5822 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
5823 ; AVX512DQ-BW-FCP-NEXT: retq
5824 %wide.vec = load <128 x i64>, ptr %in.vec, align 64
5825 %strided.vec0 = shufflevector <128 x i64> %wide.vec, <128 x i64> poison, <16 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56, i32 64, i32 72, i32 80, i32 88, i32 96, i32 104, i32 112, i32 120>
5826 %strided.vec1 = shufflevector <128 x i64> %wide.vec, <128 x i64> poison, <16 x i32> <i32 1, i32 9, i32 17, i32 25, i32 33, i32 41, i32 49, i32 57, i32 65, i32 73, i32 81, i32 89, i32 97, i32 105, i32 113, i32 121>
5827 %strided.vec2 = shufflevector <128 x i64> %wide.vec, <128 x i64> poison, <16 x i32> <i32 2, i32 10, i32 18, i32 26, i32 34, i32 42, i32 50, i32 58, i32 66, i32 74, i32 82, i32 90, i32 98, i32 106, i32 114, i32 122>
5828 %strided.vec3 = shufflevector <128 x i64> %wide.vec, <128 x i64> poison, <16 x i32> <i32 3, i32 11, i32 19, i32 27, i32 35, i32 43, i32 51, i32 59, i32 67, i32 75, i32 83, i32 91, i32 99, i32 107, i32 115, i32 123>
5829 %strided.vec4 = shufflevector <128 x i64> %wide.vec, <128 x i64> poison, <16 x i32> <i32 4, i32 12, i32 20, i32 28, i32 36, i32 44, i32 52, i32 60, i32 68, i32 76, i32 84, i32 92, i32 100, i32 108, i32 116, i32 124>
5830 %strided.vec5 = shufflevector <128 x i64> %wide.vec, <128 x i64> poison, <16 x i32> <i32 5, i32 13, i32 21, i32 29, i32 37, i32 45, i32 53, i32 61, i32 69, i32 77, i32 85, i32 93, i32 101, i32 109, i32 117, i32 125>
5831 %strided.vec6 = shufflevector <128 x i64> %wide.vec, <128 x i64> poison, <16 x i32> <i32 6, i32 14, i32 22, i32 30, i32 38, i32 46, i32 54, i32 62, i32 70, i32 78, i32 86, i32 94, i32 102, i32 110, i32 118, i32 126>
5832 %strided.vec7 = shufflevector <128 x i64> %wide.vec, <128 x i64> poison, <16 x i32> <i32 7, i32 15, i32 23, i32 31, i32 39, i32 47, i32 55, i32 63, i32 71, i32 79, i32 87, i32 95, i32 103, i32 111, i32 119, i32 127>
5833 store <16 x i64> %strided.vec0, ptr %out.vec0, align 64
5834 store <16 x i64> %strided.vec1, ptr %out.vec1, align 64
5835 store <16 x i64> %strided.vec2, ptr %out.vec2, align 64
5836 store <16 x i64> %strided.vec3, ptr %out.vec3, align 64
5837 store <16 x i64> %strided.vec4, ptr %out.vec4, align 64
5838 store <16 x i64> %strided.vec5, ptr %out.vec5, align 64
5839 store <16 x i64> %strided.vec6, ptr %out.vec6, align 64
5840 store <16 x i64> %strided.vec7, ptr %out.vec7, align 64
5844 define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6, ptr %out.vec7) nounwind {
5845 ; SSE-LABEL: load_i64_stride8_vf32:
5847 ; SSE-NEXT: subq $1688, %rsp # imm = 0x698
5848 ; SSE-NEXT: movaps 832(%rdi), %xmm0
5849 ; SSE-NEXT: movaps 320(%rdi), %xmm2
5850 ; SSE-NEXT: movaps 256(%rdi), %xmm8
5851 ; SSE-NEXT: movaps 960(%rdi), %xmm1
5852 ; SSE-NEXT: movaps 896(%rdi), %xmm10
5853 ; SSE-NEXT: movaps 448(%rdi), %xmm4
5854 ; SSE-NEXT: movaps 384(%rdi), %xmm9
5855 ; SSE-NEXT: movaps 576(%rdi), %xmm3
5856 ; SSE-NEXT: movaps 512(%rdi), %xmm12
5857 ; SSE-NEXT: movaps 64(%rdi), %xmm6
5858 ; SSE-NEXT: movaps (%rdi), %xmm11
5859 ; SSE-NEXT: movaps 704(%rdi), %xmm5
5860 ; SSE-NEXT: movaps 640(%rdi), %xmm14
5861 ; SSE-NEXT: movaps 192(%rdi), %xmm7
5862 ; SSE-NEXT: movaps 128(%rdi), %xmm13
5863 ; SSE-NEXT: movaps %xmm13, %xmm15
5864 ; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm7[0]
5865 ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5866 ; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm7[1]
5867 ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5868 ; SSE-NEXT: movaps %xmm11, %xmm7
5869 ; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm6[0]
5870 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5871 ; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm6[1]
5872 ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5873 ; SSE-NEXT: movaps %xmm9, %xmm6
5874 ; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm4[0]
5875 ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5876 ; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm4[1]
5877 ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5878 ; SSE-NEXT: movaps %xmm8, %xmm4
5879 ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm2[0]
5880 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5881 ; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm2[1]
5882 ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5883 ; SSE-NEXT: movaps %xmm14, %xmm2
5884 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm5[0]
5885 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5886 ; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm5[1]
5887 ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5888 ; SSE-NEXT: movaps %xmm12, %xmm2
5889 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
5890 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5891 ; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm3[1]
5892 ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5893 ; SSE-NEXT: movaps %xmm10, %xmm2
5894 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
5895 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5896 ; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm1[1]
5897 ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5898 ; SSE-NEXT: movaps 768(%rdi), %xmm1
5899 ; SSE-NEXT: movaps %xmm1, %xmm2
5900 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
5901 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5902 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
5903 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5904 ; SSE-NEXT: movaps 1216(%rdi), %xmm0
5905 ; SSE-NEXT: movaps 1152(%rdi), %xmm1
5906 ; SSE-NEXT: movaps %xmm1, %xmm2
5907 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
5908 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5909 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
5910 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5911 ; SSE-NEXT: movaps 1088(%rdi), %xmm0
5912 ; SSE-NEXT: movaps 1024(%rdi), %xmm1
5913 ; SSE-NEXT: movaps %xmm1, %xmm2
5914 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
5915 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5916 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
5917 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5918 ; SSE-NEXT: movaps 1472(%rdi), %xmm0
5919 ; SSE-NEXT: movaps 1408(%rdi), %xmm1
5920 ; SSE-NEXT: movaps %xmm1, %xmm2
5921 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
5922 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5923 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
5924 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5925 ; SSE-NEXT: movaps 1344(%rdi), %xmm0
5926 ; SSE-NEXT: movaps 1280(%rdi), %xmm1
5927 ; SSE-NEXT: movaps %xmm1, %xmm2
5928 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
5929 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5930 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
5931 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5932 ; SSE-NEXT: movaps 1728(%rdi), %xmm0
5933 ; SSE-NEXT: movaps 1664(%rdi), %xmm1
5934 ; SSE-NEXT: movaps %xmm1, %xmm2
5935 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
5936 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5937 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
5938 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5939 ; SSE-NEXT: movaps 1600(%rdi), %xmm0
5940 ; SSE-NEXT: movaps 1536(%rdi), %xmm1
5941 ; SSE-NEXT: movaps %xmm1, %xmm2
5942 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
5943 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5944 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
5945 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5946 ; SSE-NEXT: movaps 1984(%rdi), %xmm0
5947 ; SSE-NEXT: movaps 1920(%rdi), %xmm1
5948 ; SSE-NEXT: movaps %xmm1, %xmm2
5949 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
5950 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5951 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
5952 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5953 ; SSE-NEXT: movaps 1856(%rdi), %xmm0
5954 ; SSE-NEXT: movaps 1792(%rdi), %xmm1
5955 ; SSE-NEXT: movaps %xmm1, %xmm2
5956 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
5957 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5958 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
5959 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5960 ; SSE-NEXT: movaps 80(%rdi), %xmm0
5961 ; SSE-NEXT: movaps 16(%rdi), %xmm1
5962 ; SSE-NEXT: movaps %xmm1, %xmm2
5963 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
5964 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5965 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
5966 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5967 ; SSE-NEXT: movaps 208(%rdi), %xmm0
5968 ; SSE-NEXT: movaps 144(%rdi), %xmm1
5969 ; SSE-NEXT: movaps %xmm1, %xmm2
5970 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
5971 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5972 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
5973 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5974 ; SSE-NEXT: movaps 336(%rdi), %xmm0
5975 ; SSE-NEXT: movaps 272(%rdi), %xmm1
5976 ; SSE-NEXT: movaps %xmm1, %xmm2
5977 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
5978 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5979 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
5980 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5981 ; SSE-NEXT: movaps 464(%rdi), %xmm0
5982 ; SSE-NEXT: movaps 400(%rdi), %xmm1
5983 ; SSE-NEXT: movaps %xmm1, %xmm2
5984 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
5985 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5986 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
5987 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5988 ; SSE-NEXT: movaps 592(%rdi), %xmm0
5989 ; SSE-NEXT: movaps 528(%rdi), %xmm1
5990 ; SSE-NEXT: movaps %xmm1, %xmm2
5991 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
5992 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5993 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
5994 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5995 ; SSE-NEXT: movaps 720(%rdi), %xmm0
5996 ; SSE-NEXT: movaps 656(%rdi), %xmm1
5997 ; SSE-NEXT: movaps %xmm1, %xmm2
5998 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
5999 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6000 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
6001 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6002 ; SSE-NEXT: movaps 848(%rdi), %xmm0
6003 ; SSE-NEXT: movaps 784(%rdi), %xmm1
6004 ; SSE-NEXT: movaps %xmm1, %xmm2
6005 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
6006 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6007 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
6008 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6009 ; SSE-NEXT: movaps 976(%rdi), %xmm0
6010 ; SSE-NEXT: movaps 912(%rdi), %xmm1
6011 ; SSE-NEXT: movaps %xmm1, %xmm2
6012 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
6013 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6014 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
6015 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6016 ; SSE-NEXT: movaps 1104(%rdi), %xmm0
6017 ; SSE-NEXT: movaps 1040(%rdi), %xmm1
6018 ; SSE-NEXT: movaps %xmm1, %xmm2
6019 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
6020 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6021 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
6022 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6023 ; SSE-NEXT: movaps 1232(%rdi), %xmm0
6024 ; SSE-NEXT: movaps 1168(%rdi), %xmm1
6025 ; SSE-NEXT: movaps %xmm1, %xmm2
6026 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
6027 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6028 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
6029 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6030 ; SSE-NEXT: movaps 1360(%rdi), %xmm0
6031 ; SSE-NEXT: movaps 1296(%rdi), %xmm1
6032 ; SSE-NEXT: movaps %xmm1, %xmm2
6033 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
6034 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6035 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
6036 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6037 ; SSE-NEXT: movaps 1488(%rdi), %xmm0
6038 ; SSE-NEXT: movaps 1424(%rdi), %xmm1
6039 ; SSE-NEXT: movaps %xmm1, %xmm2
6040 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
6041 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6042 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
6043 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6044 ; SSE-NEXT: movaps 1616(%rdi), %xmm0
6045 ; SSE-NEXT: movaps 1552(%rdi), %xmm1
6046 ; SSE-NEXT: movaps %xmm1, %xmm2
6047 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
6048 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6049 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
6050 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6051 ; SSE-NEXT: movaps 1744(%rdi), %xmm0
6052 ; SSE-NEXT: movaps 1680(%rdi), %xmm1
6053 ; SSE-NEXT: movaps %xmm1, %xmm2
6054 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
6055 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6056 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
6057 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6058 ; SSE-NEXT: movaps 1872(%rdi), %xmm0
6059 ; SSE-NEXT: movaps 1808(%rdi), %xmm1
6060 ; SSE-NEXT: movaps %xmm1, %xmm2
6061 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
6062 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6063 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
6064 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6065 ; SSE-NEXT: movaps 2000(%rdi), %xmm0
6066 ; SSE-NEXT: movaps 1936(%rdi), %xmm1
6067 ; SSE-NEXT: movaps %xmm1, %xmm2
6068 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
6069 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6070 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
6071 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6072 ; SSE-NEXT: movaps 96(%rdi), %xmm0
6073 ; SSE-NEXT: movaps 32(%rdi), %xmm1
6074 ; SSE-NEXT: movaps %xmm1, %xmm2
6075 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
6076 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6077 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
6078 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6079 ; SSE-NEXT: movaps 224(%rdi), %xmm0
6080 ; SSE-NEXT: movaps 160(%rdi), %xmm1
6081 ; SSE-NEXT: movaps %xmm1, %xmm2
6082 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
6083 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6084 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
6085 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6086 ; SSE-NEXT: movaps 352(%rdi), %xmm0
6087 ; SSE-NEXT: movaps 288(%rdi), %xmm1
6088 ; SSE-NEXT: movaps %xmm1, %xmm2
6089 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
6090 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6091 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
6092 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6093 ; SSE-NEXT: movaps 480(%rdi), %xmm0
6094 ; SSE-NEXT: movaps 416(%rdi), %xmm1
6095 ; SSE-NEXT: movaps %xmm1, %xmm2
6096 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
6097 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6098 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
6099 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6100 ; SSE-NEXT: movaps 608(%rdi), %xmm0
6101 ; SSE-NEXT: movaps 544(%rdi), %xmm1
6102 ; SSE-NEXT: movaps %xmm1, %xmm2
6103 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
6104 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6105 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
6106 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6107 ; SSE-NEXT: movaps 736(%rdi), %xmm0
6108 ; SSE-NEXT: movaps 672(%rdi), %xmm1
6109 ; SSE-NEXT: movaps %xmm1, %xmm2
6110 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
6111 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6112 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
6113 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6114 ; SSE-NEXT: movaps 864(%rdi), %xmm0
6115 ; SSE-NEXT: movaps 800(%rdi), %xmm1
6116 ; SSE-NEXT: movaps %xmm1, %xmm2
6117 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
6118 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6119 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
6120 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6121 ; SSE-NEXT: movaps 992(%rdi), %xmm0
6122 ; SSE-NEXT: movaps 928(%rdi), %xmm1
6123 ; SSE-NEXT: movaps %xmm1, %xmm2
6124 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
6125 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6126 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
6127 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6128 ; SSE-NEXT: movaps 1120(%rdi), %xmm0
6129 ; SSE-NEXT: movaps 1056(%rdi), %xmm1
6130 ; SSE-NEXT: movaps %xmm1, %xmm2
6131 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
6132 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6133 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
6134 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6135 ; SSE-NEXT: movaps 1248(%rdi), %xmm0
6136 ; SSE-NEXT: movaps 1184(%rdi), %xmm1
6137 ; SSE-NEXT: movaps %xmm1, %xmm2
6138 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
6139 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6140 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
6141 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6142 ; SSE-NEXT: movaps 1376(%rdi), %xmm0
6143 ; SSE-NEXT: movaps 1312(%rdi), %xmm1
6144 ; SSE-NEXT: movaps %xmm1, %xmm2
6145 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
6146 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6147 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
6148 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6149 ; SSE-NEXT: movaps 1504(%rdi), %xmm0
6150 ; SSE-NEXT: movaps 1440(%rdi), %xmm1
6151 ; SSE-NEXT: movaps %xmm1, %xmm2
6152 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
6153 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6154 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
6155 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6156 ; SSE-NEXT: movaps 1632(%rdi), %xmm0
6157 ; SSE-NEXT: movaps 1568(%rdi), %xmm1
6158 ; SSE-NEXT: movaps %xmm1, %xmm2
6159 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
6160 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6161 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
6162 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6163 ; SSE-NEXT: movaps 1760(%rdi), %xmm0
6164 ; SSE-NEXT: movaps 1696(%rdi), %xmm1
6165 ; SSE-NEXT: movaps %xmm1, %xmm2
6166 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
6167 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6168 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
6169 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6170 ; SSE-NEXT: movaps 1888(%rdi), %xmm0
6171 ; SSE-NEXT: movaps 1824(%rdi), %xmm1
6172 ; SSE-NEXT: movaps %xmm1, %xmm2
6173 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
6174 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6175 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
6176 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6177 ; SSE-NEXT: movaps 2016(%rdi), %xmm0
6178 ; SSE-NEXT: movaps 1952(%rdi), %xmm1
6179 ; SSE-NEXT: movaps %xmm1, %xmm2
6180 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
6181 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6182 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
6183 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6184 ; SSE-NEXT: movaps 112(%rdi), %xmm0
6185 ; SSE-NEXT: movaps 48(%rdi), %xmm1
6186 ; SSE-NEXT: movaps %xmm1, %xmm2
6187 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
6188 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6189 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
6190 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6191 ; SSE-NEXT: movaps 240(%rdi), %xmm0
6192 ; SSE-NEXT: movaps 176(%rdi), %xmm1
6193 ; SSE-NEXT: movaps %xmm1, %xmm2
6194 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
6195 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6196 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
6197 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6198 ; SSE-NEXT: movaps 368(%rdi), %xmm0
6199 ; SSE-NEXT: movaps 304(%rdi), %xmm1
6200 ; SSE-NEXT: movaps %xmm1, %xmm2
6201 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
6202 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6203 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
6204 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6205 ; SSE-NEXT: movaps 496(%rdi), %xmm0
6206 ; SSE-NEXT: movaps 432(%rdi), %xmm1
6207 ; SSE-NEXT: movaps %xmm1, %xmm2
6208 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
6209 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6210 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
6211 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6212 ; SSE-NEXT: movaps 624(%rdi), %xmm0
6213 ; SSE-NEXT: movaps 560(%rdi), %xmm1
6214 ; SSE-NEXT: movaps %xmm1, %xmm2
6215 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
6216 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6217 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
6218 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6219 ; SSE-NEXT: movaps 752(%rdi), %xmm0
6220 ; SSE-NEXT: movaps 688(%rdi), %xmm1
6221 ; SSE-NEXT: movaps %xmm1, %xmm2
6222 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
6223 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6224 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
6225 ; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill
6226 ; SSE-NEXT: movaps 880(%rdi), %xmm0
6227 ; SSE-NEXT: movaps 816(%rdi), %xmm1
6228 ; SSE-NEXT: movaps %xmm1, %xmm2
6229 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
6230 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6231 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
6232 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6233 ; SSE-NEXT: movaps 1008(%rdi), %xmm0
6234 ; SSE-NEXT: movaps 944(%rdi), %xmm12
6235 ; SSE-NEXT: movaps %xmm12, %xmm1
6236 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
6237 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6238 ; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1]
6239 ; SSE-NEXT: movaps 1136(%rdi), %xmm0
6240 ; SSE-NEXT: movaps 1072(%rdi), %xmm11
6241 ; SSE-NEXT: movaps %xmm11, %xmm1
6242 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
6243 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6244 ; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1]
6245 ; SSE-NEXT: movaps 1264(%rdi), %xmm0
6246 ; SSE-NEXT: movaps 1200(%rdi), %xmm14
6247 ; SSE-NEXT: movaps %xmm14, %xmm1
6248 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
6249 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6250 ; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1]
6251 ; SSE-NEXT: movaps 1392(%rdi), %xmm0
6252 ; SSE-NEXT: movaps 1328(%rdi), %xmm13
6253 ; SSE-NEXT: movaps %xmm13, %xmm15
6254 ; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm0[0]
6255 ; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1]
6256 ; SSE-NEXT: movaps 1520(%rdi), %xmm0
6257 ; SSE-NEXT: movaps 1456(%rdi), %xmm7
6258 ; SSE-NEXT: movaps %xmm7, %xmm10
6259 ; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0]
6260 ; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1]
6261 ; SSE-NEXT: movaps 1648(%rdi), %xmm0
6262 ; SSE-NEXT: movaps 1584(%rdi), %xmm5
6263 ; SSE-NEXT: movaps %xmm5, %xmm9
6264 ; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm0[0]
6265 ; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1]
6266 ; SSE-NEXT: movaps 1776(%rdi), %xmm0
6267 ; SSE-NEXT: movaps 1712(%rdi), %xmm4
6268 ; SSE-NEXT: movaps %xmm4, %xmm8
6269 ; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0]
6270 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1]
6271 ; SSE-NEXT: movaps 1904(%rdi), %xmm0
6272 ; SSE-NEXT: movaps 1840(%rdi), %xmm1
6273 ; SSE-NEXT: movaps %xmm1, %xmm6
6274 ; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0]
6275 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
6276 ; SSE-NEXT: movaps 2032(%rdi), %xmm0
6277 ; SSE-NEXT: movaps 1968(%rdi), %xmm2
6278 ; SSE-NEXT: movaps %xmm2, %xmm3
6279 ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0]
6280 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
6281 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6282 ; SSE-NEXT: movaps %xmm0, 224(%rsi)
6283 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6284 ; SSE-NEXT: movaps %xmm0, 160(%rsi)
6285 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6286 ; SSE-NEXT: movaps %xmm0, 96(%rsi)
6287 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6288 ; SSE-NEXT: movaps %xmm0, 32(%rsi)
6289 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6290 ; SSE-NEXT: movaps %xmm0, 240(%rsi)
6291 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6292 ; SSE-NEXT: movaps %xmm0, 176(%rsi)
6293 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6294 ; SSE-NEXT: movaps %xmm0, 112(%rsi)
6295 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6296 ; SSE-NEXT: movaps %xmm0, 48(%rsi)
6297 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6298 ; SSE-NEXT: movaps %xmm0, 192(%rsi)
6299 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6300 ; SSE-NEXT: movaps %xmm0, 128(%rsi)
6301 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6302 ; SSE-NEXT: movaps %xmm0, 64(%rsi)
6303 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6304 ; SSE-NEXT: movaps %xmm0, (%rsi)
6305 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6306 ; SSE-NEXT: movaps %xmm0, 208(%rsi)
6307 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6308 ; SSE-NEXT: movaps %xmm0, 144(%rsi)
6309 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6310 ; SSE-NEXT: movaps %xmm0, 80(%rsi)
6311 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6312 ; SSE-NEXT: movaps %xmm0, 16(%rsi)
6313 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6314 ; SSE-NEXT: movaps %xmm0, 224(%rdx)
6315 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6316 ; SSE-NEXT: movaps %xmm0, 240(%rdx)
6317 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6318 ; SSE-NEXT: movaps %xmm0, 192(%rdx)
6319 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6320 ; SSE-NEXT: movaps %xmm0, 208(%rdx)
6321 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6322 ; SSE-NEXT: movaps %xmm0, 160(%rdx)
6323 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6324 ; SSE-NEXT: movaps %xmm0, 176(%rdx)
6325 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6326 ; SSE-NEXT: movaps %xmm0, 128(%rdx)
6327 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6328 ; SSE-NEXT: movaps %xmm0, 144(%rdx)
6329 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6330 ; SSE-NEXT: movaps %xmm0, 96(%rdx)
6331 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6332 ; SSE-NEXT: movaps %xmm0, 112(%rdx)
6333 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6334 ; SSE-NEXT: movaps %xmm0, 64(%rdx)
6335 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6336 ; SSE-NEXT: movaps %xmm0, 80(%rdx)
6337 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6338 ; SSE-NEXT: movaps %xmm0, 32(%rdx)
6339 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6340 ; SSE-NEXT: movaps %xmm0, 48(%rdx)
6341 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6342 ; SSE-NEXT: movaps %xmm0, (%rdx)
6343 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6344 ; SSE-NEXT: movaps %xmm0, 16(%rdx)
6345 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6346 ; SSE-NEXT: movaps %xmm0, 240(%rcx)
6347 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6348 ; SSE-NEXT: movaps %xmm0, 224(%rcx)
6349 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6350 ; SSE-NEXT: movaps %xmm0, 208(%rcx)
6351 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6352 ; SSE-NEXT: movaps %xmm0, 192(%rcx)
6353 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6354 ; SSE-NEXT: movaps %xmm0, 176(%rcx)
6355 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6356 ; SSE-NEXT: movaps %xmm0, 160(%rcx)
6357 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6358 ; SSE-NEXT: movaps %xmm0, 144(%rcx)
6359 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6360 ; SSE-NEXT: movaps %xmm0, 128(%rcx)
6361 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6362 ; SSE-NEXT: movaps %xmm0, 112(%rcx)
6363 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6364 ; SSE-NEXT: movaps %xmm0, 96(%rcx)
6365 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6366 ; SSE-NEXT: movaps %xmm0, 80(%rcx)
6367 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6368 ; SSE-NEXT: movaps %xmm0, 64(%rcx)
6369 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6370 ; SSE-NEXT: movaps %xmm0, 48(%rcx)
6371 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6372 ; SSE-NEXT: movaps %xmm0, 32(%rcx)
6373 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6374 ; SSE-NEXT: movaps %xmm0, 16(%rcx)
6375 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6376 ; SSE-NEXT: movaps %xmm0, (%rcx)
6377 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6378 ; SSE-NEXT: movaps %xmm0, 240(%r8)
6379 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6380 ; SSE-NEXT: movaps %xmm0, 224(%r8)
6381 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6382 ; SSE-NEXT: movaps %xmm0, 208(%r8)
6383 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6384 ; SSE-NEXT: movaps %xmm0, 192(%r8)
6385 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6386 ; SSE-NEXT: movaps %xmm0, 176(%r8)
6387 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6388 ; SSE-NEXT: movaps %xmm0, 160(%r8)
6389 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6390 ; SSE-NEXT: movaps %xmm0, 144(%r8)
6391 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6392 ; SSE-NEXT: movaps %xmm0, 128(%r8)
6393 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6394 ; SSE-NEXT: movaps %xmm0, 112(%r8)
6395 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6396 ; SSE-NEXT: movaps %xmm0, 96(%r8)
6397 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6398 ; SSE-NEXT: movaps %xmm0, 80(%r8)
6399 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6400 ; SSE-NEXT: movaps %xmm0, 64(%r8)
6401 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6402 ; SSE-NEXT: movaps %xmm0, 48(%r8)
6403 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6404 ; SSE-NEXT: movaps %xmm0, 32(%r8)
6405 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6406 ; SSE-NEXT: movaps %xmm0, 16(%r8)
6407 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6408 ; SSE-NEXT: movaps %xmm0, (%r8)
6409 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6410 ; SSE-NEXT: movaps %xmm0, 240(%r9)
6411 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6412 ; SSE-NEXT: movaps %xmm0, 224(%r9)
6413 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6414 ; SSE-NEXT: movaps %xmm0, 208(%r9)
6415 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6416 ; SSE-NEXT: movaps %xmm0, 192(%r9)
6417 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6418 ; SSE-NEXT: movaps %xmm0, 176(%r9)
6419 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6420 ; SSE-NEXT: movaps %xmm0, 160(%r9)
6421 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6422 ; SSE-NEXT: movaps %xmm0, 144(%r9)
6423 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6424 ; SSE-NEXT: movaps %xmm0, 128(%r9)
6425 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6426 ; SSE-NEXT: movaps %xmm0, 112(%r9)
6427 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6428 ; SSE-NEXT: movaps %xmm0, 96(%r9)
6429 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6430 ; SSE-NEXT: movaps %xmm0, 80(%r9)
6431 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6432 ; SSE-NEXT: movaps %xmm0, 64(%r9)
6433 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6434 ; SSE-NEXT: movaps %xmm0, 48(%r9)
6435 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6436 ; SSE-NEXT: movaps %xmm0, 32(%r9)
6437 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6438 ; SSE-NEXT: movaps %xmm0, 16(%r9)
6439 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6440 ; SSE-NEXT: movaps %xmm0, (%r9)
6441 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
6442 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6443 ; SSE-NEXT: movaps %xmm0, 240(%rax)
6444 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6445 ; SSE-NEXT: movaps %xmm0, 224(%rax)
6446 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6447 ; SSE-NEXT: movaps %xmm0, 208(%rax)
6448 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6449 ; SSE-NEXT: movaps %xmm0, 192(%rax)
6450 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6451 ; SSE-NEXT: movaps %xmm0, 176(%rax)
6452 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6453 ; SSE-NEXT: movaps %xmm0, 160(%rax)
6454 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6455 ; SSE-NEXT: movaps %xmm0, 144(%rax)
6456 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6457 ; SSE-NEXT: movaps %xmm0, 128(%rax)
6458 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6459 ; SSE-NEXT: movaps %xmm0, 112(%rax)
6460 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6461 ; SSE-NEXT: movaps %xmm0, 96(%rax)
6462 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6463 ; SSE-NEXT: movaps %xmm0, 80(%rax)
6464 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6465 ; SSE-NEXT: movaps %xmm0, 64(%rax)
6466 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6467 ; SSE-NEXT: movaps %xmm0, 48(%rax)
6468 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6469 ; SSE-NEXT: movaps %xmm0, 32(%rax)
6470 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6471 ; SSE-NEXT: movaps %xmm0, 16(%rax)
6472 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6473 ; SSE-NEXT: movaps %xmm0, (%rax)
6474 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
6475 ; SSE-NEXT: movaps %xmm3, 240(%rax)
6476 ; SSE-NEXT: movaps %xmm6, 224(%rax)
6477 ; SSE-NEXT: movaps %xmm8, 208(%rax)
6478 ; SSE-NEXT: movaps %xmm9, 192(%rax)
6479 ; SSE-NEXT: movaps %xmm10, 176(%rax)
6480 ; SSE-NEXT: movaps %xmm15, 160(%rax)
6481 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6482 ; SSE-NEXT: movaps %xmm0, 144(%rax)
6483 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6484 ; SSE-NEXT: movaps %xmm0, 128(%rax)
6485 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6486 ; SSE-NEXT: movaps %xmm0, 112(%rax)
6487 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6488 ; SSE-NEXT: movaps %xmm0, 96(%rax)
6489 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6490 ; SSE-NEXT: movaps %xmm0, 80(%rax)
6491 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6492 ; SSE-NEXT: movaps %xmm0, 64(%rax)
6493 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6494 ; SSE-NEXT: movaps %xmm0, 48(%rax)
6495 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6496 ; SSE-NEXT: movaps %xmm0, 32(%rax)
6497 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6498 ; SSE-NEXT: movaps %xmm0, 16(%rax)
6499 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6500 ; SSE-NEXT: movaps %xmm0, (%rax)
6501 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
6502 ; SSE-NEXT: movaps %xmm2, 240(%rax)
6503 ; SSE-NEXT: movaps %xmm1, 224(%rax)
6504 ; SSE-NEXT: movaps %xmm4, 208(%rax)
6505 ; SSE-NEXT: movaps %xmm5, 192(%rax)
6506 ; SSE-NEXT: movaps %xmm7, 176(%rax)
6507 ; SSE-NEXT: movaps %xmm13, 160(%rax)
6508 ; SSE-NEXT: movaps %xmm14, 144(%rax)
6509 ; SSE-NEXT: movaps %xmm11, 128(%rax)
6510 ; SSE-NEXT: movaps %xmm12, 112(%rax)
6511 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6512 ; SSE-NEXT: movaps %xmm0, 96(%rax)
6513 ; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
6514 ; SSE-NEXT: movaps %xmm0, 80(%rax)
6515 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6516 ; SSE-NEXT: movaps %xmm0, 64(%rax)
6517 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6518 ; SSE-NEXT: movaps %xmm0, 48(%rax)
6519 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6520 ; SSE-NEXT: movaps %xmm0, 32(%rax)
6521 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6522 ; SSE-NEXT: movaps %xmm0, 16(%rax)
6523 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6524 ; SSE-NEXT: movaps %xmm0, (%rax)
6525 ; SSE-NEXT: addq $1688, %rsp # imm = 0x698
6528 ; AVX-LABEL: load_i64_stride8_vf32:
6530 ; AVX-NEXT: subq $2216, %rsp # imm = 0x8A8
6531 ; AVX-NEXT: vmovaps 192(%rdi), %xmm2
6532 ; AVX-NEXT: vmovaps 128(%rdi), %xmm3
6533 ; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm3[0],xmm2[0]
6534 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6535 ; AVX-NEXT: vmovaps 704(%rdi), %xmm0
6536 ; AVX-NEXT: vmovaps 640(%rdi), %xmm1
6537 ; AVX-NEXT: vmovlhps {{.*#+}} xmm4 = xmm1[0],xmm0[0]
6538 ; AVX-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6539 ; AVX-NEXT: vmovaps 1216(%rdi), %xmm4
6540 ; AVX-NEXT: vmovaps 1152(%rdi), %xmm6
6541 ; AVX-NEXT: vmovlhps {{.*#+}} xmm5 = xmm6[0],xmm4[0]
6542 ; AVX-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6543 ; AVX-NEXT: vmovaps 64(%rdi), %xmm8
6544 ; AVX-NEXT: vmovaps (%rdi), %xmm9
6545 ; AVX-NEXT: vmovlhps {{.*#+}} xmm5 = xmm9[0],xmm8[0]
6546 ; AVX-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6547 ; AVX-NEXT: vmovaps 576(%rdi), %xmm5
6548 ; AVX-NEXT: vmovaps 512(%rdi), %xmm7
6549 ; AVX-NEXT: vmovlhps {{.*#+}} xmm10 = xmm7[0],xmm5[0]
6550 ; AVX-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6551 ; AVX-NEXT: vmovaps 1088(%rdi), %xmm10
6552 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1]
6553 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6554 ; AVX-NEXT: vmovaps 1024(%rdi), %xmm2
6555 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm9[1],xmm8[1]
6556 ; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6557 ; AVX-NEXT: vmovlhps {{.*#+}} xmm3 = xmm2[0],xmm10[0]
6558 ; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6559 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm6[1],xmm4[1]
6560 ; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6561 ; AVX-NEXT: vmovaps 1600(%rdi), %xmm3
6562 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm10[1]
6563 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6564 ; AVX-NEXT: vmovaps 1728(%rdi), %xmm2
6565 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
6566 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6567 ; AVX-NEXT: vmovaps 1664(%rdi), %xmm0
6568 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm7[1],xmm5[1]
6569 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6570 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm2[0]
6571 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6572 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
6573 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6574 ; AVX-NEXT: vmovaps 1536(%rdi), %xmm0
6575 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm3[0]
6576 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6577 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1]
6578 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6579 ; AVX-NEXT: vmovaps 448(%rdi), %xmm0
6580 ; AVX-NEXT: vmovaps 384(%rdi), %xmm1
6581 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
6582 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6583 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
6584 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6585 ; AVX-NEXT: vmovaps 320(%rdi), %xmm0
6586 ; AVX-NEXT: vmovaps 256(%rdi), %xmm1
6587 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
6588 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6589 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
6590 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6591 ; AVX-NEXT: vmovaps 1472(%rdi), %xmm0
6592 ; AVX-NEXT: vmovaps 1408(%rdi), %xmm1
6593 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
6594 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6595 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
6596 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6597 ; AVX-NEXT: vmovaps 1344(%rdi), %xmm0
6598 ; AVX-NEXT: vmovaps 1280(%rdi), %xmm1
6599 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
6600 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6601 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
6602 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6603 ; AVX-NEXT: vmovaps 960(%rdi), %xmm0
6604 ; AVX-NEXT: vmovaps 896(%rdi), %xmm1
6605 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
6606 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6607 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
6608 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6609 ; AVX-NEXT: vmovaps 832(%rdi), %xmm0
6610 ; AVX-NEXT: vmovaps 768(%rdi), %xmm1
6611 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
6612 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6613 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
6614 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6615 ; AVX-NEXT: vmovaps 1984(%rdi), %xmm0
6616 ; AVX-NEXT: vmovaps 1920(%rdi), %xmm1
6617 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
6618 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6619 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
6620 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6621 ; AVX-NEXT: vmovaps 1856(%rdi), %xmm0
6622 ; AVX-NEXT: vmovaps 1792(%rdi), %xmm1
6623 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
6624 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6625 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
6626 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6627 ; AVX-NEXT: vmovaps 192(%rdi), %ymm1
6628 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6629 ; AVX-NEXT: vmovaps 128(%rdi), %ymm0
6630 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6631 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
6632 ; AVX-NEXT: vmovaps 80(%rdi), %xmm2
6633 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6634 ; AVX-NEXT: vmovaps 16(%rdi), %xmm1
6635 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6636 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
6637 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6638 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6639 ; AVX-NEXT: vmovaps 704(%rdi), %ymm1
6640 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6641 ; AVX-NEXT: vmovaps 640(%rdi), %ymm0
6642 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6643 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
6644 ; AVX-NEXT: vmovaps 592(%rdi), %xmm2
6645 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6646 ; AVX-NEXT: vmovaps 528(%rdi), %xmm1
6647 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6648 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
6649 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6650 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6651 ; AVX-NEXT: vmovaps 1216(%rdi), %ymm1
6652 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6653 ; AVX-NEXT: vmovaps 1152(%rdi), %ymm0
6654 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6655 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
6656 ; AVX-NEXT: vmovaps 1104(%rdi), %xmm2
6657 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6658 ; AVX-NEXT: vmovaps 1040(%rdi), %xmm1
6659 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6660 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
6661 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6662 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6663 ; AVX-NEXT: vmovaps 1728(%rdi), %ymm1
6664 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6665 ; AVX-NEXT: vmovaps 1664(%rdi), %ymm0
6666 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6667 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
6668 ; AVX-NEXT: vmovaps 1616(%rdi), %xmm14
6669 ; AVX-NEXT: vmovaps 1552(%rdi), %xmm12
6670 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm12[0],xmm14[0]
6671 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6672 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6673 ; AVX-NEXT: vmovaps 1984(%rdi), %ymm1
6674 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6675 ; AVX-NEXT: vmovaps 1920(%rdi), %ymm0
6676 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6677 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
6678 ; AVX-NEXT: vmovaps 1872(%rdi), %xmm2
6679 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6680 ; AVX-NEXT: vmovaps 1808(%rdi), %xmm1
6681 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6682 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
6683 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6684 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6685 ; AVX-NEXT: vmovaps 1472(%rdi), %ymm11
6686 ; AVX-NEXT: vmovaps 1408(%rdi), %ymm10
6687 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm11[0],ymm10[2],ymm11[2]
6688 ; AVX-NEXT: vmovaps 1360(%rdi), %xmm9
6689 ; AVX-NEXT: vmovaps 1296(%rdi), %xmm8
6690 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm8[0],xmm9[0]
6691 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6692 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6693 ; AVX-NEXT: vmovaps 960(%rdi), %ymm7
6694 ; AVX-NEXT: vmovaps 896(%rdi), %ymm6
6695 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm6[0],ymm7[0],ymm6[2],ymm7[2]
6696 ; AVX-NEXT: vmovaps 848(%rdi), %xmm5
6697 ; AVX-NEXT: vmovaps 784(%rdi), %xmm4
6698 ; AVX-NEXT: vmovlhps {{.*#+}} xmm13 = xmm4[0],xmm5[0]
6699 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7]
6700 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6701 ; AVX-NEXT: vmovaps 448(%rdi), %ymm3
6702 ; AVX-NEXT: vmovaps 384(%rdi), %ymm13
6703 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm3[0],ymm13[2],ymm3[2]
6704 ; AVX-NEXT: vmovaps 336(%rdi), %xmm2
6705 ; AVX-NEXT: vmovaps 272(%rdi), %xmm1
6706 ; AVX-NEXT: vmovlhps {{.*#+}} xmm15 = xmm1[0],xmm2[0]
6707 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
6708 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6709 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6710 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
6711 ; AVX-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
6712 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
6713 ; AVX-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
6714 ; AVX-NEXT: # xmm15 = mem[0,1],xmm15[2,3]
6715 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
6716 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6717 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6718 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
6719 ; AVX-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
6720 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
6721 ; AVX-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
6722 ; AVX-NEXT: # xmm15 = mem[0,1],xmm15[2,3]
6723 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
6724 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6725 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6726 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
6727 ; AVX-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
6728 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
6729 ; AVX-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
6730 ; AVX-NEXT: # xmm15 = mem[0,1],xmm15[2,3]
6731 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
6732 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6733 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6734 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
6735 ; AVX-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
6736 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm12 = xmm12[1],xmm14[1]
6737 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7]
6738 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6739 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm13[1],ymm3[1],ymm13[3],ymm3[3]
6740 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
6741 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6742 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6743 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm7[1],ymm6[3],ymm7[3]
6744 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm5[1]
6745 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6746 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6747 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm11[1],ymm10[3],ymm11[3]
6748 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm8[1],xmm9[1]
6749 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6750 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6751 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6752 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
6753 ; AVX-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
6754 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
6755 ; AVX-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
6756 ; AVX-NEXT: # xmm1 = mem[0,1],xmm1[2,3]
6757 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6758 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6759 ; AVX-NEXT: vmovaps 480(%rdi), %xmm0
6760 ; AVX-NEXT: vmovaps 416(%rdi), %xmm1
6761 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
6762 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6763 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
6764 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6765 ; AVX-NEXT: vmovaps 352(%rdi), %xmm0
6766 ; AVX-NEXT: vmovaps 288(%rdi), %xmm1
6767 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
6768 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6769 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
6770 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6771 ; AVX-NEXT: vmovaps 992(%rdi), %xmm0
6772 ; AVX-NEXT: vmovaps 928(%rdi), %xmm1
6773 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
6774 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6775 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
6776 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6777 ; AVX-NEXT: vmovaps 864(%rdi), %xmm0
6778 ; AVX-NEXT: vmovaps 800(%rdi), %xmm1
6779 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
6780 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6781 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
6782 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6783 ; AVX-NEXT: vmovaps 1120(%rdi), %xmm0
6784 ; AVX-NEXT: vmovaps 1056(%rdi), %xmm1
6785 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
6786 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6787 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
6788 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6789 ; AVX-NEXT: vmovaps 1248(%rdi), %xmm0
6790 ; AVX-NEXT: vmovaps 1184(%rdi), %xmm1
6791 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
6792 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6793 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
6794 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6795 ; AVX-NEXT: vmovaps 1504(%rdi), %xmm0
6796 ; AVX-NEXT: vmovaps 1440(%rdi), %xmm1
6797 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
6798 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6799 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
6800 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6801 ; AVX-NEXT: vmovaps 1376(%rdi), %xmm0
6802 ; AVX-NEXT: vmovaps 1312(%rdi), %xmm1
6803 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
6804 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6805 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
6806 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6807 ; AVX-NEXT: vmovaps 736(%rdi), %xmm0
6808 ; AVX-NEXT: vmovaps 672(%rdi), %xmm1
6809 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
6810 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6811 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
6812 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6813 ; AVX-NEXT: vmovaps 608(%rdi), %xmm0
6814 ; AVX-NEXT: vmovaps 544(%rdi), %xmm1
6815 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
6816 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6817 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
6818 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6819 ; AVX-NEXT: vmovaps 224(%rdi), %xmm0
6820 ; AVX-NEXT: vmovaps 160(%rdi), %xmm1
6821 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
6822 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6823 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
6824 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6825 ; AVX-NEXT: vmovaps 96(%rdi), %xmm0
6826 ; AVX-NEXT: vmovaps 32(%rdi), %xmm1
6827 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
6828 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6829 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
6830 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6831 ; AVX-NEXT: vmovaps 1632(%rdi), %xmm0
6832 ; AVX-NEXT: vmovaps 1568(%rdi), %xmm1
6833 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
6834 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6835 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
6836 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6837 ; AVX-NEXT: vmovaps 1760(%rdi), %xmm0
6838 ; AVX-NEXT: vmovaps 1696(%rdi), %xmm1
6839 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
6840 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6841 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
6842 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6843 ; AVX-NEXT: vmovaps 1888(%rdi), %xmm0
6844 ; AVX-NEXT: vmovaps 1824(%rdi), %xmm1
6845 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
6846 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6847 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
6848 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6849 ; AVX-NEXT: vmovaps 2016(%rdi), %xmm0
6850 ; AVX-NEXT: vmovaps 1952(%rdi), %xmm1
6851 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
6852 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6853 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
6854 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6855 ; AVX-NEXT: vmovaps 224(%rdi), %ymm1
6856 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6857 ; AVX-NEXT: vmovaps 160(%rdi), %ymm0
6858 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6859 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
6860 ; AVX-NEXT: vmovaps 112(%rdi), %xmm2
6861 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6862 ; AVX-NEXT: vmovaps 48(%rdi), %xmm1
6863 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6864 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
6865 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6866 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6867 ; AVX-NEXT: vmovaps 480(%rdi), %ymm1
6868 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6869 ; AVX-NEXT: vmovaps 416(%rdi), %ymm0
6870 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6871 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
6872 ; AVX-NEXT: vmovaps 368(%rdi), %xmm2
6873 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6874 ; AVX-NEXT: vmovaps 304(%rdi), %xmm1
6875 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6876 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
6877 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6878 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6879 ; AVX-NEXT: vmovaps 736(%rdi), %ymm1
6880 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6881 ; AVX-NEXT: vmovaps 672(%rdi), %ymm0
6882 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6883 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
6884 ; AVX-NEXT: vmovaps 624(%rdi), %xmm2
6885 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6886 ; AVX-NEXT: vmovaps 560(%rdi), %xmm1
6887 ; AVX-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill
6888 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
6889 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6890 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6891 ; AVX-NEXT: vmovaps 992(%rdi), %ymm1
6892 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6893 ; AVX-NEXT: vmovaps 928(%rdi), %ymm0
6894 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6895 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
6896 ; AVX-NEXT: vmovaps 880(%rdi), %xmm2
6897 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6898 ; AVX-NEXT: vmovaps 816(%rdi), %xmm1
6899 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6900 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
6901 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6902 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6903 ; AVX-NEXT: vmovaps 1248(%rdi), %ymm1
6904 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6905 ; AVX-NEXT: vmovaps 1184(%rdi), %ymm0
6906 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6907 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
6908 ; AVX-NEXT: vmovaps 1136(%rdi), %xmm14
6909 ; AVX-NEXT: vmovaps 1072(%rdi), %xmm13
6910 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm13[0],xmm14[0]
6911 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6912 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6913 ; AVX-NEXT: vmovaps 1504(%rdi), %ymm12
6914 ; AVX-NEXT: vmovaps 1440(%rdi), %ymm11
6915 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm12[0],ymm11[2],ymm12[2]
6916 ; AVX-NEXT: vmovaps 1392(%rdi), %xmm10
6917 ; AVX-NEXT: vmovaps 1328(%rdi), %xmm9
6918 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm9[0],xmm10[0]
6919 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6920 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6921 ; AVX-NEXT: vmovaps 1760(%rdi), %ymm8
6922 ; AVX-NEXT: vmovaps 1696(%rdi), %ymm7
6923 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm8[0],ymm7[2],ymm8[2]
6924 ; AVX-NEXT: vmovaps 1648(%rdi), %xmm6
6925 ; AVX-NEXT: vmovaps 1584(%rdi), %xmm5
6926 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm6[0]
6927 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6928 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6929 ; AVX-NEXT: vmovaps 2016(%rdi), %ymm4
6930 ; AVX-NEXT: vmovaps 1952(%rdi), %ymm3
6931 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm4[0],ymm3[2],ymm4[2]
6932 ; AVX-NEXT: vmovaps 1904(%rdi), %xmm2
6933 ; AVX-NEXT: vmovaps 1840(%rdi), %xmm1
6934 ; AVX-NEXT: vmovlhps {{.*#+}} xmm15 = xmm1[0],xmm2[0]
6935 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
6936 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6937 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6938 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
6939 ; AVX-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
6940 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
6941 ; AVX-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
6942 ; AVX-NEXT: # xmm15 = mem[0,1],xmm15[2,3]
6943 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
6944 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6945 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6946 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
6947 ; AVX-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
6948 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
6949 ; AVX-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
6950 ; AVX-NEXT: # xmm15 = mem[0,1],xmm15[2,3]
6951 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
6952 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6953 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6954 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
6955 ; AVX-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
6956 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
6957 ; AVX-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
6958 ; AVX-NEXT: # xmm15 = mem[0,1],xmm15[2,3]
6959 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
6960 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6961 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6962 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
6963 ; AVX-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
6964 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
6965 ; AVX-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
6966 ; AVX-NEXT: # xmm15 = mem[0,1],xmm15[2,3]
6967 ; AVX-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm0[4,5,6,7]
6968 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6969 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
6970 ; AVX-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
6971 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm13[1],xmm14[1]
6972 ; AVX-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm0[4,5,6,7]
6973 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm12[1],ymm11[3],ymm12[3]
6974 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm9[1],xmm10[1]
6975 ; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm0[4,5,6,7]
6976 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm8[1],ymm7[3],ymm8[3]
6977 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm6[1]
6978 ; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7]
6979 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm4[1],ymm3[3],ymm4[3]
6980 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
6981 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6982 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
6983 ; AVX-NEXT: vmovaps %xmm1, 240(%rsi)
6984 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
6985 ; AVX-NEXT: vmovaps %xmm1, 224(%rsi)
6986 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
6987 ; AVX-NEXT: vmovaps %xmm1, 32(%rsi)
6988 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
6989 ; AVX-NEXT: vmovaps %xmm1, 160(%rsi)
6990 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
6991 ; AVX-NEXT: vmovaps %xmm1, 96(%rsi)
6992 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
6993 ; AVX-NEXT: vmovaps %xmm1, 48(%rsi)
6994 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
6995 ; AVX-NEXT: vmovaps %xmm1, 176(%rsi)
6996 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
6997 ; AVX-NEXT: vmovaps %xmm1, 112(%rsi)
6998 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
6999 ; AVX-NEXT: vmovaps %xmm1, 208(%rsi)
7000 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7001 ; AVX-NEXT: vmovaps %xmm1, 192(%rsi)
7002 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7003 ; AVX-NEXT: vmovaps %xmm1, 128(%rsi)
7004 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7005 ; AVX-NEXT: vmovaps %xmm1, 64(%rsi)
7006 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7007 ; AVX-NEXT: vmovaps %xmm1, (%rsi)
7008 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7009 ; AVX-NEXT: vmovaps %xmm1, 144(%rsi)
7010 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7011 ; AVX-NEXT: vmovaps %xmm1, 80(%rsi)
7012 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7013 ; AVX-NEXT: vmovaps %xmm1, 16(%rsi)
7014 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7015 ; AVX-NEXT: vmovaps %xmm1, 224(%rdx)
7016 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7017 ; AVX-NEXT: vmovaps %xmm1, 240(%rdx)
7018 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7019 ; AVX-NEXT: vmovaps %xmm1, 96(%rdx)
7020 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7021 ; AVX-NEXT: vmovaps %xmm1, 112(%rdx)
7022 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7023 ; AVX-NEXT: vmovaps %xmm1, 160(%rdx)
7024 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7025 ; AVX-NEXT: vmovaps %xmm1, 176(%rdx)
7026 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7027 ; AVX-NEXT: vmovaps %xmm1, 32(%rdx)
7028 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7029 ; AVX-NEXT: vmovaps %xmm1, 48(%rdx)
7030 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7031 ; AVX-NEXT: vmovaps %xmm1, 192(%rdx)
7032 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7033 ; AVX-NEXT: vmovaps %xmm1, 208(%rdx)
7034 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7035 ; AVX-NEXT: vmovaps %xmm1, 64(%rdx)
7036 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7037 ; AVX-NEXT: vmovaps %xmm1, 80(%rdx)
7038 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7039 ; AVX-NEXT: vmovaps %xmm1, 128(%rdx)
7040 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7041 ; AVX-NEXT: vmovaps %xmm1, 144(%rdx)
7042 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7043 ; AVX-NEXT: vmovaps %xmm1, (%rdx)
7044 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7045 ; AVX-NEXT: vmovaps %xmm1, 16(%rdx)
7046 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7047 ; AVX-NEXT: vmovaps %ymm1, 32(%rcx)
7048 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7049 ; AVX-NEXT: vmovaps %ymm1, 96(%rcx)
7050 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7051 ; AVX-NEXT: vmovaps %ymm1, 160(%rcx)
7052 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7053 ; AVX-NEXT: vmovaps %ymm1, 224(%rcx)
7054 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7055 ; AVX-NEXT: vmovaps %ymm1, 192(%rcx)
7056 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7057 ; AVX-NEXT: vmovaps %ymm1, 128(%rcx)
7058 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7059 ; AVX-NEXT: vmovaps %ymm1, 64(%rcx)
7060 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7061 ; AVX-NEXT: vmovaps %ymm1, (%rcx)
7062 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7063 ; AVX-NEXT: vmovaps %ymm1, 224(%r8)
7064 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7065 ; AVX-NEXT: vmovaps %ymm1, 160(%r8)
7066 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7067 ; AVX-NEXT: vmovaps %ymm1, 96(%r8)
7068 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7069 ; AVX-NEXT: vmovaps %ymm1, 32(%r8)
7070 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7071 ; AVX-NEXT: vmovaps %ymm1, 192(%r8)
7072 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7073 ; AVX-NEXT: vmovaps %ymm1, 128(%r8)
7074 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7075 ; AVX-NEXT: vmovaps %ymm1, 64(%r8)
7076 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7077 ; AVX-NEXT: vmovaps %ymm1, (%r8)
7078 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7079 ; AVX-NEXT: vmovaps %xmm1, 240(%r9)
7080 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7081 ; AVX-NEXT: vmovaps %xmm1, 224(%r9)
7082 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7083 ; AVX-NEXT: vmovaps %xmm1, 48(%r9)
7084 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7085 ; AVX-NEXT: vmovaps %xmm1, 160(%r9)
7086 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7087 ; AVX-NEXT: vmovaps %xmm1, 176(%r9)
7088 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7089 ; AVX-NEXT: vmovaps %xmm1, 96(%r9)
7090 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7091 ; AVX-NEXT: vmovaps %xmm1, 112(%r9)
7092 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7093 ; AVX-NEXT: vmovaps %xmm1, 32(%r9)
7094 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7095 ; AVX-NEXT: vmovaps %xmm1, 16(%r9)
7096 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7097 ; AVX-NEXT: vmovaps %xmm1, (%r9)
7098 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7099 ; AVX-NEXT: vmovaps %xmm1, 192(%r9)
7100 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7101 ; AVX-NEXT: vmovaps %xmm1, 208(%r9)
7102 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7103 ; AVX-NEXT: vmovaps %xmm1, 128(%r9)
7104 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7105 ; AVX-NEXT: vmovaps %xmm1, 144(%r9)
7106 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7107 ; AVX-NEXT: vmovaps %xmm1, 64(%r9)
7108 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7109 ; AVX-NEXT: vmovaps %xmm1, 80(%r9)
7110 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
7111 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7112 ; AVX-NEXT: vmovaps %xmm1, 240(%rax)
7113 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7114 ; AVX-NEXT: vmovaps %xmm1, 224(%rax)
7115 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7116 ; AVX-NEXT: vmovaps %xmm1, 208(%rax)
7117 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7118 ; AVX-NEXT: vmovaps %xmm1, 192(%rax)
7119 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7120 ; AVX-NEXT: vmovaps %xmm1, (%rax)
7121 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7122 ; AVX-NEXT: vmovaps %xmm1, 16(%rax)
7123 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7124 ; AVX-NEXT: vmovaps %xmm1, 64(%rax)
7125 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7126 ; AVX-NEXT: vmovaps %xmm1, 80(%rax)
7127 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7128 ; AVX-NEXT: vmovaps %xmm1, 160(%rax)
7129 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7130 ; AVX-NEXT: vmovaps %xmm1, 176(%rax)
7131 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7132 ; AVX-NEXT: vmovaps %xmm1, 144(%rax)
7133 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7134 ; AVX-NEXT: vmovaps %xmm1, 128(%rax)
7135 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7136 ; AVX-NEXT: vmovaps %xmm1, 96(%rax)
7137 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7138 ; AVX-NEXT: vmovaps %xmm1, 112(%rax)
7139 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7140 ; AVX-NEXT: vmovaps %xmm1, 32(%rax)
7141 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7142 ; AVX-NEXT: vmovaps %xmm1, 48(%rax)
7143 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
7144 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7145 ; AVX-NEXT: vmovaps %ymm1, 224(%rax)
7146 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7147 ; AVX-NEXT: vmovaps %ymm1, 192(%rax)
7148 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7149 ; AVX-NEXT: vmovaps %ymm1, 160(%rax)
7150 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7151 ; AVX-NEXT: vmovaps %ymm1, 128(%rax)
7152 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7153 ; AVX-NEXT: vmovaps %ymm1, 96(%rax)
7154 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7155 ; AVX-NEXT: vmovaps %ymm1, 64(%rax)
7156 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7157 ; AVX-NEXT: vmovaps %ymm1, 32(%rax)
7158 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7159 ; AVX-NEXT: vmovaps %ymm1, (%rax)
7160 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
7161 ; AVX-NEXT: vmovaps %ymm0, 224(%rax)
7162 ; AVX-NEXT: vmovaps %ymm5, 192(%rax)
7163 ; AVX-NEXT: vmovaps %ymm9, 160(%rax)
7164 ; AVX-NEXT: vmovaps %ymm13, 128(%rax)
7165 ; AVX-NEXT: vmovaps %ymm15, 96(%rax)
7166 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7167 ; AVX-NEXT: vmovaps %ymm0, 64(%rax)
7168 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7169 ; AVX-NEXT: vmovaps %ymm0, 32(%rax)
7170 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7171 ; AVX-NEXT: vmovaps %ymm0, (%rax)
7172 ; AVX-NEXT: addq $2216, %rsp # imm = 0x8A8
7173 ; AVX-NEXT: vzeroupper
7176 ; AVX2-LABEL: load_i64_stride8_vf32:
7178 ; AVX2-NEXT: subq $2248, %rsp # imm = 0x8C8
7179 ; AVX2-NEXT: vmovaps 320(%rdi), %xmm0
7180 ; AVX2-NEXT: vinsertf128 $1, 448(%rdi), %ymm0, %ymm0
7181 ; AVX2-NEXT: vmovaps 256(%rdi), %xmm1
7182 ; AVX2-NEXT: vinsertf128 $1, 384(%rdi), %ymm1, %ymm1
7183 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
7184 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7185 ; AVX2-NEXT: vmovaps 832(%rdi), %xmm2
7186 ; AVX2-NEXT: vinsertf128 $1, 960(%rdi), %ymm2, %ymm2
7187 ; AVX2-NEXT: vmovaps 768(%rdi), %xmm3
7188 ; AVX2-NEXT: vinsertf128 $1, 896(%rdi), %ymm3, %ymm3
7189 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
7190 ; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7191 ; AVX2-NEXT: vmovaps 1344(%rdi), %xmm4
7192 ; AVX2-NEXT: vinsertf128 $1, 1472(%rdi), %ymm4, %ymm4
7193 ; AVX2-NEXT: vmovaps 1280(%rdi), %xmm5
7194 ; AVX2-NEXT: vinsertf128 $1, 1408(%rdi), %ymm5, %ymm5
7195 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm5[0],ymm4[0],ymm5[2],ymm4[2]
7196 ; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7197 ; AVX2-NEXT: vmovaps 1856(%rdi), %xmm6
7198 ; AVX2-NEXT: vinsertf128 $1, 1984(%rdi), %ymm6, %ymm6
7199 ; AVX2-NEXT: vmovaps 1792(%rdi), %xmm7
7200 ; AVX2-NEXT: vinsertf128 $1, 1920(%rdi), %ymm7, %ymm7
7201 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm7[0],ymm6[0],ymm7[2],ymm6[2]
7202 ; AVX2-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7203 ; AVX2-NEXT: vmovaps (%rdi), %xmm9
7204 ; AVX2-NEXT: vmovaps 64(%rdi), %xmm8
7205 ; AVX2-NEXT: vinsertf128 $1, 192(%rdi), %ymm8, %ymm8
7206 ; AVX2-NEXT: vinsertf128 $1, 128(%rdi), %ymm9, %ymm9
7207 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2]
7208 ; AVX2-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7209 ; AVX2-NEXT: vmovaps 576(%rdi), %xmm10
7210 ; AVX2-NEXT: vinsertf128 $1, 704(%rdi), %ymm10, %ymm10
7211 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
7212 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7213 ; AVX2-NEXT: vmovaps 512(%rdi), %xmm0
7214 ; AVX2-NEXT: vinsertf128 $1, 640(%rdi), %ymm0, %ymm0
7215 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm2[1],ymm3[3],ymm2[3]
7216 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7217 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm10[0],ymm0[2],ymm10[2]
7218 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7219 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm5[1],ymm4[1],ymm5[3],ymm4[3]
7220 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7221 ; AVX2-NEXT: vmovaps 1088(%rdi), %xmm1
7222 ; AVX2-NEXT: vinsertf128 $1, 1216(%rdi), %ymm1, %ymm1
7223 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm7[1],ymm6[1],ymm7[3],ymm6[3]
7224 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7225 ; AVX2-NEXT: vmovaps 1024(%rdi), %xmm2
7226 ; AVX2-NEXT: vinsertf128 $1, 1152(%rdi), %ymm2, %ymm2
7227 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm9[1],ymm8[1],ymm9[3],ymm8[3]
7228 ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7229 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
7230 ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7231 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm10[1],ymm0[3],ymm10[3]
7232 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7233 ; AVX2-NEXT: vmovaps 1600(%rdi), %xmm0
7234 ; AVX2-NEXT: vinsertf128 $1, 1728(%rdi), %ymm0, %ymm0
7235 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3]
7236 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7237 ; AVX2-NEXT: vmovaps 1536(%rdi), %xmm1
7238 ; AVX2-NEXT: vinsertf128 $1, 1664(%rdi), %ymm1, %ymm1
7239 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
7240 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7241 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
7242 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7243 ; AVX2-NEXT: vmovaps 320(%rdi), %ymm2
7244 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7245 ; AVX2-NEXT: vmovaps 256(%rdi), %ymm3
7246 ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7247 ; AVX2-NEXT: vmovaps 448(%rdi), %ymm1
7248 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7249 ; AVX2-NEXT: vmovaps 384(%rdi), %ymm0
7250 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7251 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
7252 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
7253 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
7254 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7255 ; AVX2-NEXT: vmovaps 832(%rdi), %ymm2
7256 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7257 ; AVX2-NEXT: vmovaps 768(%rdi), %ymm3
7258 ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7259 ; AVX2-NEXT: vmovaps 960(%rdi), %ymm1
7260 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7261 ; AVX2-NEXT: vmovaps 896(%rdi), %ymm0
7262 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7263 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
7264 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
7265 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
7266 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7267 ; AVX2-NEXT: vmovaps 1344(%rdi), %ymm2
7268 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7269 ; AVX2-NEXT: vmovaps 1280(%rdi), %ymm3
7270 ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7271 ; AVX2-NEXT: vmovaps 1472(%rdi), %ymm1
7272 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7273 ; AVX2-NEXT: vmovaps 1408(%rdi), %ymm0
7274 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7275 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
7276 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
7277 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
7278 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7279 ; AVX2-NEXT: vmovaps 1856(%rdi), %ymm2
7280 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7281 ; AVX2-NEXT: vmovaps 1792(%rdi), %ymm3
7282 ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7283 ; AVX2-NEXT: vmovaps 1984(%rdi), %ymm1
7284 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7285 ; AVX2-NEXT: vmovaps 1920(%rdi), %ymm0
7286 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7287 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
7288 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
7289 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
7290 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7291 ; AVX2-NEXT: vmovaps 64(%rdi), %ymm2
7292 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7293 ; AVX2-NEXT: vmovaps (%rdi), %ymm1
7294 ; AVX2-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill
7295 ; AVX2-NEXT: vmovaps 192(%rdi), %ymm14
7296 ; AVX2-NEXT: vmovaps 128(%rdi), %ymm12
7297 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm12[0],ymm14[0],ymm12[2],ymm14[2]
7298 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
7299 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
7300 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7301 ; AVX2-NEXT: vmovaps 576(%rdi), %ymm11
7302 ; AVX2-NEXT: vmovaps 512(%rdi), %ymm10
7303 ; AVX2-NEXT: vmovaps 704(%rdi), %ymm8
7304 ; AVX2-NEXT: vmovaps 640(%rdi), %ymm7
7305 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm8[0],ymm7[2],ymm8[2]
7306 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm10[0],ymm11[0],ymm10[2],ymm11[2]
7307 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3]
7308 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7309 ; AVX2-NEXT: vmovaps 1088(%rdi), %ymm6
7310 ; AVX2-NEXT: vmovaps 1024(%rdi), %ymm9
7311 ; AVX2-NEXT: vmovaps 1216(%rdi), %ymm5
7312 ; AVX2-NEXT: vmovaps 1152(%rdi), %ymm3
7313 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm5[0],ymm3[2],ymm5[2]
7314 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm9[0],ymm6[0],ymm9[2],ymm6[2]
7315 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3]
7316 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7317 ; AVX2-NEXT: vmovaps 1600(%rdi), %ymm4
7318 ; AVX2-NEXT: vmovaps 1536(%rdi), %ymm13
7319 ; AVX2-NEXT: vmovaps 1728(%rdi), %ymm2
7320 ; AVX2-NEXT: vmovaps 1664(%rdi), %ymm1
7321 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
7322 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm13[0],ymm4[0],ymm13[2],ymm4[2]
7323 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3]
7324 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7325 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7326 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
7327 ; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
7328 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
7329 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
7330 ; AVX2-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3]
7331 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3]
7332 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7333 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7334 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
7335 ; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
7336 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
7337 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
7338 ; AVX2-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3]
7339 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3]
7340 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7341 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7342 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
7343 ; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
7344 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
7345 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
7346 ; AVX2-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3]
7347 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3]
7348 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7349 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7350 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
7351 ; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
7352 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
7353 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
7354 ; AVX2-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3]
7355 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3]
7356 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7357 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm12[1],ymm14[1],ymm12[3],ymm14[3]
7358 ; AVX2-NEXT: vmovups (%rsp), %ymm12 # 32-byte Reload
7359 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload
7360 ; AVX2-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3]
7361 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3]
7362 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7363 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm8[1],ymm7[3],ymm8[3]
7364 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm10[1],ymm11[1],ymm10[3],ymm11[3]
7365 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],ymm0[2,3]
7366 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7367 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm5[1],ymm3[3],ymm5[3]
7368 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm9[1],ymm6[1],ymm9[3],ymm6[3]
7369 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
7370 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7371 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm2[1],ymm1[3],ymm2[3]
7372 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm13[1],ymm4[1],ymm13[3],ymm4[3]
7373 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
7374 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7375 ; AVX2-NEXT: vmovaps 96(%rdi), %xmm0
7376 ; AVX2-NEXT: vinsertf128 $1, 224(%rdi), %ymm0, %ymm0
7377 ; AVX2-NEXT: vmovaps 32(%rdi), %xmm1
7378 ; AVX2-NEXT: vinsertf128 $1, 160(%rdi), %ymm1, %ymm1
7379 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
7380 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7381 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
7382 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7383 ; AVX2-NEXT: vmovaps 352(%rdi), %xmm0
7384 ; AVX2-NEXT: vinsertf128 $1, 480(%rdi), %ymm0, %ymm0
7385 ; AVX2-NEXT: vmovaps 288(%rdi), %xmm1
7386 ; AVX2-NEXT: vinsertf128 $1, 416(%rdi), %ymm1, %ymm1
7387 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
7388 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7389 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
7390 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7391 ; AVX2-NEXT: vmovaps 608(%rdi), %xmm0
7392 ; AVX2-NEXT: vinsertf128 $1, 736(%rdi), %ymm0, %ymm0
7393 ; AVX2-NEXT: vmovaps 544(%rdi), %xmm1
7394 ; AVX2-NEXT: vinsertf128 $1, 672(%rdi), %ymm1, %ymm1
7395 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
7396 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7397 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
7398 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7399 ; AVX2-NEXT: vmovaps 864(%rdi), %xmm0
7400 ; AVX2-NEXT: vinsertf128 $1, 992(%rdi), %ymm0, %ymm0
7401 ; AVX2-NEXT: vmovaps 800(%rdi), %xmm1
7402 ; AVX2-NEXT: vinsertf128 $1, 928(%rdi), %ymm1, %ymm1
7403 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
7404 ; AVX2-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill
7405 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
7406 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7407 ; AVX2-NEXT: vmovaps 1120(%rdi), %xmm0
7408 ; AVX2-NEXT: vinsertf128 $1, 1248(%rdi), %ymm0, %ymm0
7409 ; AVX2-NEXT: vmovaps 1056(%rdi), %xmm1
7410 ; AVX2-NEXT: vinsertf128 $1, 1184(%rdi), %ymm1, %ymm1
7411 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
7412 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7413 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
7414 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7415 ; AVX2-NEXT: vmovaps 1376(%rdi), %xmm0
7416 ; AVX2-NEXT: vinsertf128 $1, 1504(%rdi), %ymm0, %ymm0
7417 ; AVX2-NEXT: vmovaps 1312(%rdi), %xmm1
7418 ; AVX2-NEXT: vinsertf128 $1, 1440(%rdi), %ymm1, %ymm1
7419 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
7420 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7421 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
7422 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7423 ; AVX2-NEXT: vmovaps 1632(%rdi), %xmm0
7424 ; AVX2-NEXT: vinsertf128 $1, 1760(%rdi), %ymm0, %ymm0
7425 ; AVX2-NEXT: vmovaps 1568(%rdi), %xmm1
7426 ; AVX2-NEXT: vinsertf128 $1, 1696(%rdi), %ymm1, %ymm1
7427 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
7428 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7429 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
7430 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7431 ; AVX2-NEXT: vmovaps 1888(%rdi), %xmm0
7432 ; AVX2-NEXT: vinsertf128 $1, 2016(%rdi), %ymm0, %ymm0
7433 ; AVX2-NEXT: vmovaps 1824(%rdi), %xmm1
7434 ; AVX2-NEXT: vinsertf128 $1, 1952(%rdi), %ymm1, %ymm1
7435 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
7436 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7437 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
7438 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7439 ; AVX2-NEXT: vmovaps 96(%rdi), %ymm2
7440 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7441 ; AVX2-NEXT: vmovaps 32(%rdi), %ymm3
7442 ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7443 ; AVX2-NEXT: vmovaps 224(%rdi), %ymm1
7444 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7445 ; AVX2-NEXT: vmovaps 160(%rdi), %ymm0
7446 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7447 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
7448 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
7449 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
7450 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7451 ; AVX2-NEXT: vmovaps 352(%rdi), %ymm2
7452 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7453 ; AVX2-NEXT: vmovaps 288(%rdi), %ymm3
7454 ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7455 ; AVX2-NEXT: vmovaps 480(%rdi), %ymm1
7456 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7457 ; AVX2-NEXT: vmovaps 416(%rdi), %ymm0
7458 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7459 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
7460 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
7461 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
7462 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7463 ; AVX2-NEXT: vmovaps 608(%rdi), %ymm2
7464 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7465 ; AVX2-NEXT: vmovaps 544(%rdi), %ymm3
7466 ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7467 ; AVX2-NEXT: vmovaps 736(%rdi), %ymm1
7468 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7469 ; AVX2-NEXT: vmovaps 672(%rdi), %ymm0
7470 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7471 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
7472 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
7473 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
7474 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7475 ; AVX2-NEXT: vmovaps 864(%rdi), %ymm2
7476 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7477 ; AVX2-NEXT: vmovaps 800(%rdi), %ymm3
7478 ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7479 ; AVX2-NEXT: vmovaps 992(%rdi), %ymm1
7480 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7481 ; AVX2-NEXT: vmovaps 928(%rdi), %ymm0
7482 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7483 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
7484 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
7485 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
7486 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7487 ; AVX2-NEXT: vmovaps 1120(%rdi), %ymm2
7488 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7489 ; AVX2-NEXT: vmovaps 1056(%rdi), %ymm1
7490 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7491 ; AVX2-NEXT: vmovaps 1248(%rdi), %ymm12
7492 ; AVX2-NEXT: vmovaps 1184(%rdi), %ymm11
7493 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm12[0],ymm11[2],ymm12[2]
7494 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
7495 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
7496 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7497 ; AVX2-NEXT: vmovaps 1376(%rdi), %ymm10
7498 ; AVX2-NEXT: vmovaps 1312(%rdi), %ymm9
7499 ; AVX2-NEXT: vmovaps 1504(%rdi), %ymm8
7500 ; AVX2-NEXT: vmovaps 1440(%rdi), %ymm7
7501 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm8[0],ymm7[2],ymm8[2]
7502 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm9[0],ymm10[0],ymm9[2],ymm10[2]
7503 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3]
7504 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7505 ; AVX2-NEXT: vmovaps 1632(%rdi), %ymm6
7506 ; AVX2-NEXT: vmovaps 1568(%rdi), %ymm15
7507 ; AVX2-NEXT: vmovaps 1760(%rdi), %ymm5
7508 ; AVX2-NEXT: vmovaps 1696(%rdi), %ymm3
7509 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm5[0],ymm3[2],ymm5[2]
7510 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm15[0],ymm6[0],ymm15[2],ymm6[2]
7511 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm14[2,3],ymm0[2,3]
7512 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7513 ; AVX2-NEXT: vmovaps 1888(%rdi), %ymm4
7514 ; AVX2-NEXT: vmovaps 1824(%rdi), %ymm14
7515 ; AVX2-NEXT: vmovaps 2016(%rdi), %ymm2
7516 ; AVX2-NEXT: vmovaps 1952(%rdi), %ymm1
7517 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
7518 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm14[0],ymm4[0],ymm14[2],ymm4[2]
7519 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3]
7520 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7521 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7522 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
7523 ; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
7524 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
7525 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
7526 ; AVX2-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
7527 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3]
7528 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7529 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7530 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
7531 ; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
7532 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
7533 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
7534 ; AVX2-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
7535 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3]
7536 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7537 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7538 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
7539 ; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
7540 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
7541 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
7542 ; AVX2-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
7543 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3]
7544 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7545 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7546 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
7547 ; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
7548 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
7549 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
7550 ; AVX2-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
7551 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm13[2,3],ymm0[2,3]
7552 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm12[1],ymm11[3],ymm12[3]
7553 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
7554 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
7555 ; AVX2-NEXT: # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3]
7556 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm11[2,3],ymm0[2,3]
7557 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm8[1],ymm7[3],ymm8[3]
7558 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm9[1],ymm10[1],ymm9[3],ymm10[3]
7559 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm7[2,3],ymm0[2,3]
7560 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm5[1],ymm3[3],ymm5[3]
7561 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm15[1],ymm6[1],ymm15[3],ymm6[3]
7562 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm0[2,3]
7563 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm2[1],ymm1[3],ymm2[3]
7564 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm14[1],ymm4[1],ymm14[3],ymm4[3]
7565 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
7566 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7567 ; AVX2-NEXT: vmovaps %ymm1, 192(%rsi)
7568 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7569 ; AVX2-NEXT: vmovaps %ymm1, 128(%rsi)
7570 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7571 ; AVX2-NEXT: vmovaps %ymm1, 64(%rsi)
7572 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7573 ; AVX2-NEXT: vmovaps %ymm1, (%rsi)
7574 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7575 ; AVX2-NEXT: vmovaps %ymm1, 224(%rsi)
7576 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7577 ; AVX2-NEXT: vmovaps %ymm1, 160(%rsi)
7578 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7579 ; AVX2-NEXT: vmovaps %ymm1, 96(%rsi)
7580 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7581 ; AVX2-NEXT: vmovaps %ymm1, 32(%rsi)
7582 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7583 ; AVX2-NEXT: vmovaps %ymm1, 192(%rdx)
7584 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7585 ; AVX2-NEXT: vmovaps %ymm1, 128(%rdx)
7586 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7587 ; AVX2-NEXT: vmovaps %ymm1, 64(%rdx)
7588 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7589 ; AVX2-NEXT: vmovaps %ymm1, (%rdx)
7590 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7591 ; AVX2-NEXT: vmovaps %ymm1, 224(%rdx)
7592 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7593 ; AVX2-NEXT: vmovaps %ymm1, 160(%rdx)
7594 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7595 ; AVX2-NEXT: vmovaps %ymm1, 96(%rdx)
7596 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7597 ; AVX2-NEXT: vmovaps %ymm1, 32(%rdx)
7598 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7599 ; AVX2-NEXT: vmovaps %ymm1, 192(%rcx)
7600 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7601 ; AVX2-NEXT: vmovaps %ymm1, 128(%rcx)
7602 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7603 ; AVX2-NEXT: vmovaps %ymm1, 64(%rcx)
7604 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7605 ; AVX2-NEXT: vmovaps %ymm1, (%rcx)
7606 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7607 ; AVX2-NEXT: vmovaps %ymm1, 224(%rcx)
7608 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7609 ; AVX2-NEXT: vmovaps %ymm1, 160(%rcx)
7610 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7611 ; AVX2-NEXT: vmovaps %ymm1, 96(%rcx)
7612 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7613 ; AVX2-NEXT: vmovaps %ymm1, 32(%rcx)
7614 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7615 ; AVX2-NEXT: vmovaps %ymm1, 192(%r8)
7616 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7617 ; AVX2-NEXT: vmovaps %ymm1, 128(%r8)
7618 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7619 ; AVX2-NEXT: vmovaps %ymm1, 64(%r8)
7620 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7621 ; AVX2-NEXT: vmovaps %ymm1, (%r8)
7622 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7623 ; AVX2-NEXT: vmovaps %ymm1, 224(%r8)
7624 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7625 ; AVX2-NEXT: vmovaps %ymm1, 160(%r8)
7626 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7627 ; AVX2-NEXT: vmovaps %ymm1, 96(%r8)
7628 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7629 ; AVX2-NEXT: vmovaps %ymm1, 32(%r8)
7630 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7631 ; AVX2-NEXT: vmovaps %ymm1, 224(%r9)
7632 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7633 ; AVX2-NEXT: vmovaps %ymm1, 192(%r9)
7634 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7635 ; AVX2-NEXT: vmovaps %ymm1, 160(%r9)
7636 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7637 ; AVX2-NEXT: vmovaps %ymm1, 128(%r9)
7638 ; AVX2-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload
7639 ; AVX2-NEXT: vmovaps %ymm1, 96(%r9)
7640 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7641 ; AVX2-NEXT: vmovaps %ymm1, 64(%r9)
7642 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7643 ; AVX2-NEXT: vmovaps %ymm1, 32(%r9)
7644 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7645 ; AVX2-NEXT: vmovaps %ymm1, (%r9)
7646 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
7647 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7648 ; AVX2-NEXT: vmovaps %ymm1, 224(%rax)
7649 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7650 ; AVX2-NEXT: vmovaps %ymm1, 192(%rax)
7651 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7652 ; AVX2-NEXT: vmovaps %ymm1, 160(%rax)
7653 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7654 ; AVX2-NEXT: vmovaps %ymm1, 128(%rax)
7655 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7656 ; AVX2-NEXT: vmovaps %ymm1, 96(%rax)
7657 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7658 ; AVX2-NEXT: vmovaps %ymm1, 64(%rax)
7659 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7660 ; AVX2-NEXT: vmovaps %ymm1, 32(%rax)
7661 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7662 ; AVX2-NEXT: vmovaps %ymm1, (%rax)
7663 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
7664 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7665 ; AVX2-NEXT: vmovaps %ymm1, 224(%rax)
7666 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7667 ; AVX2-NEXT: vmovaps %ymm1, 192(%rax)
7668 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7669 ; AVX2-NEXT: vmovaps %ymm1, 160(%rax)
7670 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7671 ; AVX2-NEXT: vmovaps %ymm1, 128(%rax)
7672 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7673 ; AVX2-NEXT: vmovaps %ymm1, 96(%rax)
7674 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7675 ; AVX2-NEXT: vmovaps %ymm1, 64(%rax)
7676 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7677 ; AVX2-NEXT: vmovaps %ymm1, 32(%rax)
7678 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7679 ; AVX2-NEXT: vmovaps %ymm1, (%rax)
7680 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
7681 ; AVX2-NEXT: vmovaps %ymm0, 224(%rax)
7682 ; AVX2-NEXT: vmovaps %ymm3, 192(%rax)
7683 ; AVX2-NEXT: vmovaps %ymm7, 160(%rax)
7684 ; AVX2-NEXT: vmovaps %ymm11, 128(%rax)
7685 ; AVX2-NEXT: vmovaps %ymm13, 96(%rax)
7686 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7687 ; AVX2-NEXT: vmovaps %ymm0, 64(%rax)
7688 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7689 ; AVX2-NEXT: vmovaps %ymm0, 32(%rax)
7690 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7691 ; AVX2-NEXT: vmovaps %ymm0, (%rax)
7692 ; AVX2-NEXT: addq $2248, %rsp # imm = 0x8C8
7693 ; AVX2-NEXT: vzeroupper
7696 ; AVX2-FP-LABEL: load_i64_stride8_vf32:
7698 ; AVX2-FP-NEXT: subq $2248, %rsp # imm = 0x8C8
7699 ; AVX2-FP-NEXT: vmovaps 320(%rdi), %xmm0
7700 ; AVX2-FP-NEXT: vinsertf128 $1, 448(%rdi), %ymm0, %ymm0
7701 ; AVX2-FP-NEXT: vmovaps 256(%rdi), %xmm1
7702 ; AVX2-FP-NEXT: vinsertf128 $1, 384(%rdi), %ymm1, %ymm1
7703 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
7704 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7705 ; AVX2-FP-NEXT: vmovaps 832(%rdi), %xmm2
7706 ; AVX2-FP-NEXT: vinsertf128 $1, 960(%rdi), %ymm2, %ymm2
7707 ; AVX2-FP-NEXT: vmovaps 768(%rdi), %xmm3
7708 ; AVX2-FP-NEXT: vinsertf128 $1, 896(%rdi), %ymm3, %ymm3
7709 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
7710 ; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7711 ; AVX2-FP-NEXT: vmovaps 1344(%rdi), %xmm4
7712 ; AVX2-FP-NEXT: vinsertf128 $1, 1472(%rdi), %ymm4, %ymm4
7713 ; AVX2-FP-NEXT: vmovaps 1280(%rdi), %xmm5
7714 ; AVX2-FP-NEXT: vinsertf128 $1, 1408(%rdi), %ymm5, %ymm5
7715 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm5[0],ymm4[0],ymm5[2],ymm4[2]
7716 ; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7717 ; AVX2-FP-NEXT: vmovaps 1856(%rdi), %xmm6
7718 ; AVX2-FP-NEXT: vinsertf128 $1, 1984(%rdi), %ymm6, %ymm6
7719 ; AVX2-FP-NEXT: vmovaps 1792(%rdi), %xmm7
7720 ; AVX2-FP-NEXT: vinsertf128 $1, 1920(%rdi), %ymm7, %ymm7
7721 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm7[0],ymm6[0],ymm7[2],ymm6[2]
7722 ; AVX2-FP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7723 ; AVX2-FP-NEXT: vmovaps (%rdi), %xmm9
7724 ; AVX2-FP-NEXT: vmovaps 64(%rdi), %xmm8
7725 ; AVX2-FP-NEXT: vinsertf128 $1, 192(%rdi), %ymm8, %ymm8
7726 ; AVX2-FP-NEXT: vinsertf128 $1, 128(%rdi), %ymm9, %ymm9
7727 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2]
7728 ; AVX2-FP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7729 ; AVX2-FP-NEXT: vmovaps 576(%rdi), %xmm10
7730 ; AVX2-FP-NEXT: vinsertf128 $1, 704(%rdi), %ymm10, %ymm10
7731 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
7732 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7733 ; AVX2-FP-NEXT: vmovaps 512(%rdi), %xmm0
7734 ; AVX2-FP-NEXT: vinsertf128 $1, 640(%rdi), %ymm0, %ymm0
7735 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm2[1],ymm3[3],ymm2[3]
7736 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7737 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm10[0],ymm0[2],ymm10[2]
7738 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7739 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm5[1],ymm4[1],ymm5[3],ymm4[3]
7740 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7741 ; AVX2-FP-NEXT: vmovaps 1088(%rdi), %xmm1
7742 ; AVX2-FP-NEXT: vinsertf128 $1, 1216(%rdi), %ymm1, %ymm1
7743 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm7[1],ymm6[1],ymm7[3],ymm6[3]
7744 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7745 ; AVX2-FP-NEXT: vmovaps 1024(%rdi), %xmm2
7746 ; AVX2-FP-NEXT: vinsertf128 $1, 1152(%rdi), %ymm2, %ymm2
7747 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm9[1],ymm8[1],ymm9[3],ymm8[3]
7748 ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7749 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
7750 ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7751 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm10[1],ymm0[3],ymm10[3]
7752 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7753 ; AVX2-FP-NEXT: vmovaps 1600(%rdi), %xmm0
7754 ; AVX2-FP-NEXT: vinsertf128 $1, 1728(%rdi), %ymm0, %ymm0
7755 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3]
7756 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7757 ; AVX2-FP-NEXT: vmovaps 1536(%rdi), %xmm1
7758 ; AVX2-FP-NEXT: vinsertf128 $1, 1664(%rdi), %ymm1, %ymm1
7759 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
7760 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7761 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
7762 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7763 ; AVX2-FP-NEXT: vmovaps 320(%rdi), %ymm2
7764 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7765 ; AVX2-FP-NEXT: vmovaps 256(%rdi), %ymm3
7766 ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7767 ; AVX2-FP-NEXT: vmovaps 448(%rdi), %ymm1
7768 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7769 ; AVX2-FP-NEXT: vmovaps 384(%rdi), %ymm0
7770 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7771 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
7772 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
7773 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
7774 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7775 ; AVX2-FP-NEXT: vmovaps 832(%rdi), %ymm2
7776 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7777 ; AVX2-FP-NEXT: vmovaps 768(%rdi), %ymm3
7778 ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7779 ; AVX2-FP-NEXT: vmovaps 960(%rdi), %ymm1
7780 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7781 ; AVX2-FP-NEXT: vmovaps 896(%rdi), %ymm0
7782 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7783 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
7784 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
7785 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
7786 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7787 ; AVX2-FP-NEXT: vmovaps 1344(%rdi), %ymm2
7788 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7789 ; AVX2-FP-NEXT: vmovaps 1280(%rdi), %ymm3
7790 ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7791 ; AVX2-FP-NEXT: vmovaps 1472(%rdi), %ymm1
7792 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7793 ; AVX2-FP-NEXT: vmovaps 1408(%rdi), %ymm0
7794 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7795 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
7796 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
7797 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
7798 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7799 ; AVX2-FP-NEXT: vmovaps 1856(%rdi), %ymm2
7800 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7801 ; AVX2-FP-NEXT: vmovaps 1792(%rdi), %ymm3
7802 ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7803 ; AVX2-FP-NEXT: vmovaps 1984(%rdi), %ymm1
7804 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7805 ; AVX2-FP-NEXT: vmovaps 1920(%rdi), %ymm0
7806 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7807 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
7808 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
7809 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
7810 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7811 ; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm2
7812 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7813 ; AVX2-FP-NEXT: vmovaps (%rdi), %ymm1
7814 ; AVX2-FP-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill
7815 ; AVX2-FP-NEXT: vmovaps 192(%rdi), %ymm14
7816 ; AVX2-FP-NEXT: vmovaps 128(%rdi), %ymm12
7817 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm12[0],ymm14[0],ymm12[2],ymm14[2]
7818 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
7819 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
7820 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7821 ; AVX2-FP-NEXT: vmovaps 576(%rdi), %ymm11
7822 ; AVX2-FP-NEXT: vmovaps 512(%rdi), %ymm10
7823 ; AVX2-FP-NEXT: vmovaps 704(%rdi), %ymm8
7824 ; AVX2-FP-NEXT: vmovaps 640(%rdi), %ymm7
7825 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm8[0],ymm7[2],ymm8[2]
7826 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm10[0],ymm11[0],ymm10[2],ymm11[2]
7827 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3]
7828 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7829 ; AVX2-FP-NEXT: vmovaps 1088(%rdi), %ymm6
7830 ; AVX2-FP-NEXT: vmovaps 1024(%rdi), %ymm9
7831 ; AVX2-FP-NEXT: vmovaps 1216(%rdi), %ymm5
7832 ; AVX2-FP-NEXT: vmovaps 1152(%rdi), %ymm3
7833 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm5[0],ymm3[2],ymm5[2]
7834 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm9[0],ymm6[0],ymm9[2],ymm6[2]
7835 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3]
7836 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7837 ; AVX2-FP-NEXT: vmovaps 1600(%rdi), %ymm4
7838 ; AVX2-FP-NEXT: vmovaps 1536(%rdi), %ymm13
7839 ; AVX2-FP-NEXT: vmovaps 1728(%rdi), %ymm2
7840 ; AVX2-FP-NEXT: vmovaps 1664(%rdi), %ymm1
7841 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
7842 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm13[0],ymm4[0],ymm13[2],ymm4[2]
7843 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3]
7844 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7845 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7846 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
7847 ; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
7848 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
7849 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
7850 ; AVX2-FP-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3]
7851 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3]
7852 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7853 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7854 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
7855 ; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
7856 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
7857 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
7858 ; AVX2-FP-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3]
7859 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3]
7860 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7861 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7862 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
7863 ; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
7864 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
7865 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
7866 ; AVX2-FP-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3]
7867 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3]
7868 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7869 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7870 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
7871 ; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
7872 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
7873 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
7874 ; AVX2-FP-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3]
7875 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3]
7876 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7877 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm12[1],ymm14[1],ymm12[3],ymm14[3]
7878 ; AVX2-FP-NEXT: vmovups (%rsp), %ymm12 # 32-byte Reload
7879 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload
7880 ; AVX2-FP-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3]
7881 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3]
7882 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7883 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm8[1],ymm7[3],ymm8[3]
7884 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm10[1],ymm11[1],ymm10[3],ymm11[3]
7885 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],ymm0[2,3]
7886 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7887 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm5[1],ymm3[3],ymm5[3]
7888 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm9[1],ymm6[1],ymm9[3],ymm6[3]
7889 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
7890 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7891 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm2[1],ymm1[3],ymm2[3]
7892 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm13[1],ymm4[1],ymm13[3],ymm4[3]
7893 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
7894 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7895 ; AVX2-FP-NEXT: vmovaps 96(%rdi), %xmm0
7896 ; AVX2-FP-NEXT: vinsertf128 $1, 224(%rdi), %ymm0, %ymm0
7897 ; AVX2-FP-NEXT: vmovaps 32(%rdi), %xmm1
7898 ; AVX2-FP-NEXT: vinsertf128 $1, 160(%rdi), %ymm1, %ymm1
7899 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
7900 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7901 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
7902 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7903 ; AVX2-FP-NEXT: vmovaps 352(%rdi), %xmm0
7904 ; AVX2-FP-NEXT: vinsertf128 $1, 480(%rdi), %ymm0, %ymm0
7905 ; AVX2-FP-NEXT: vmovaps 288(%rdi), %xmm1
7906 ; AVX2-FP-NEXT: vinsertf128 $1, 416(%rdi), %ymm1, %ymm1
7907 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
7908 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7909 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
7910 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7911 ; AVX2-FP-NEXT: vmovaps 608(%rdi), %xmm0
7912 ; AVX2-FP-NEXT: vinsertf128 $1, 736(%rdi), %ymm0, %ymm0
7913 ; AVX2-FP-NEXT: vmovaps 544(%rdi), %xmm1
7914 ; AVX2-FP-NEXT: vinsertf128 $1, 672(%rdi), %ymm1, %ymm1
7915 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
7916 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7917 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
7918 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7919 ; AVX2-FP-NEXT: vmovaps 864(%rdi), %xmm0
7920 ; AVX2-FP-NEXT: vinsertf128 $1, 992(%rdi), %ymm0, %ymm0
7921 ; AVX2-FP-NEXT: vmovaps 800(%rdi), %xmm1
7922 ; AVX2-FP-NEXT: vinsertf128 $1, 928(%rdi), %ymm1, %ymm1
7923 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
7924 ; AVX2-FP-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill
7925 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
7926 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7927 ; AVX2-FP-NEXT: vmovaps 1120(%rdi), %xmm0
7928 ; AVX2-FP-NEXT: vinsertf128 $1, 1248(%rdi), %ymm0, %ymm0
7929 ; AVX2-FP-NEXT: vmovaps 1056(%rdi), %xmm1
7930 ; AVX2-FP-NEXT: vinsertf128 $1, 1184(%rdi), %ymm1, %ymm1
7931 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
7932 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7933 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
7934 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7935 ; AVX2-FP-NEXT: vmovaps 1376(%rdi), %xmm0
7936 ; AVX2-FP-NEXT: vinsertf128 $1, 1504(%rdi), %ymm0, %ymm0
7937 ; AVX2-FP-NEXT: vmovaps 1312(%rdi), %xmm1
7938 ; AVX2-FP-NEXT: vinsertf128 $1, 1440(%rdi), %ymm1, %ymm1
7939 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
7940 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7941 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
7942 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7943 ; AVX2-FP-NEXT: vmovaps 1632(%rdi), %xmm0
7944 ; AVX2-FP-NEXT: vinsertf128 $1, 1760(%rdi), %ymm0, %ymm0
7945 ; AVX2-FP-NEXT: vmovaps 1568(%rdi), %xmm1
7946 ; AVX2-FP-NEXT: vinsertf128 $1, 1696(%rdi), %ymm1, %ymm1
7947 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
7948 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7949 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
7950 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7951 ; AVX2-FP-NEXT: vmovaps 1888(%rdi), %xmm0
7952 ; AVX2-FP-NEXT: vinsertf128 $1, 2016(%rdi), %ymm0, %ymm0
7953 ; AVX2-FP-NEXT: vmovaps 1824(%rdi), %xmm1
7954 ; AVX2-FP-NEXT: vinsertf128 $1, 1952(%rdi), %ymm1, %ymm1
7955 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
7956 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7957 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
7958 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7959 ; AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm2
7960 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7961 ; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm3
7962 ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7963 ; AVX2-FP-NEXT: vmovaps 224(%rdi), %ymm1
7964 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7965 ; AVX2-FP-NEXT: vmovaps 160(%rdi), %ymm0
7966 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7967 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
7968 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
7969 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
7970 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7971 ; AVX2-FP-NEXT: vmovaps 352(%rdi), %ymm2
7972 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7973 ; AVX2-FP-NEXT: vmovaps 288(%rdi), %ymm3
7974 ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7975 ; AVX2-FP-NEXT: vmovaps 480(%rdi), %ymm1
7976 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7977 ; AVX2-FP-NEXT: vmovaps 416(%rdi), %ymm0
7978 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7979 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
7980 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
7981 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
7982 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7983 ; AVX2-FP-NEXT: vmovaps 608(%rdi), %ymm2
7984 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7985 ; AVX2-FP-NEXT: vmovaps 544(%rdi), %ymm3
7986 ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7987 ; AVX2-FP-NEXT: vmovaps 736(%rdi), %ymm1
7988 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7989 ; AVX2-FP-NEXT: vmovaps 672(%rdi), %ymm0
7990 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7991 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
7992 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
7993 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
7994 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7995 ; AVX2-FP-NEXT: vmovaps 864(%rdi), %ymm2
7996 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7997 ; AVX2-FP-NEXT: vmovaps 800(%rdi), %ymm3
7998 ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7999 ; AVX2-FP-NEXT: vmovaps 992(%rdi), %ymm1
8000 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8001 ; AVX2-FP-NEXT: vmovaps 928(%rdi), %ymm0
8002 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8003 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
8004 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
8005 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
8006 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8007 ; AVX2-FP-NEXT: vmovaps 1120(%rdi), %ymm2
8008 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8009 ; AVX2-FP-NEXT: vmovaps 1056(%rdi), %ymm1
8010 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8011 ; AVX2-FP-NEXT: vmovaps 1248(%rdi), %ymm12
8012 ; AVX2-FP-NEXT: vmovaps 1184(%rdi), %ymm11
8013 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm12[0],ymm11[2],ymm12[2]
8014 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
8015 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
8016 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8017 ; AVX2-FP-NEXT: vmovaps 1376(%rdi), %ymm10
8018 ; AVX2-FP-NEXT: vmovaps 1312(%rdi), %ymm9
8019 ; AVX2-FP-NEXT: vmovaps 1504(%rdi), %ymm8
8020 ; AVX2-FP-NEXT: vmovaps 1440(%rdi), %ymm7
8021 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm8[0],ymm7[2],ymm8[2]
8022 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm9[0],ymm10[0],ymm9[2],ymm10[2]
8023 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3]
8024 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8025 ; AVX2-FP-NEXT: vmovaps 1632(%rdi), %ymm6
8026 ; AVX2-FP-NEXT: vmovaps 1568(%rdi), %ymm15
8027 ; AVX2-FP-NEXT: vmovaps 1760(%rdi), %ymm5
8028 ; AVX2-FP-NEXT: vmovaps 1696(%rdi), %ymm3
8029 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm5[0],ymm3[2],ymm5[2]
8030 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm15[0],ymm6[0],ymm15[2],ymm6[2]
8031 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm14[2,3],ymm0[2,3]
8032 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8033 ; AVX2-FP-NEXT: vmovaps 1888(%rdi), %ymm4
8034 ; AVX2-FP-NEXT: vmovaps 1824(%rdi), %ymm14
8035 ; AVX2-FP-NEXT: vmovaps 2016(%rdi), %ymm2
8036 ; AVX2-FP-NEXT: vmovaps 1952(%rdi), %ymm1
8037 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
8038 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm14[0],ymm4[0],ymm14[2],ymm4[2]
8039 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3]
8040 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8041 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8042 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
8043 ; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
8044 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
8045 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
8046 ; AVX2-FP-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
8047 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3]
8048 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8049 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8050 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
8051 ; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
8052 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
8053 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
8054 ; AVX2-FP-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
8055 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3]
8056 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8057 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8058 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
8059 ; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
8060 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
8061 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
8062 ; AVX2-FP-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
8063 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3]
8064 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8065 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8066 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
8067 ; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
8068 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
8069 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
8070 ; AVX2-FP-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
8071 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm13[2,3],ymm0[2,3]
8072 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm12[1],ymm11[3],ymm12[3]
8073 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
8074 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
8075 ; AVX2-FP-NEXT: # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3]
8076 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm11[2,3],ymm0[2,3]
8077 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm8[1],ymm7[3],ymm8[3]
8078 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm9[1],ymm10[1],ymm9[3],ymm10[3]
8079 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm7[2,3],ymm0[2,3]
8080 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm5[1],ymm3[3],ymm5[3]
8081 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm15[1],ymm6[1],ymm15[3],ymm6[3]
8082 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm0[2,3]
8083 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm2[1],ymm1[3],ymm2[3]
8084 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm14[1],ymm4[1],ymm14[3],ymm4[3]
8085 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
8086 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8087 ; AVX2-FP-NEXT: vmovaps %ymm1, 192(%rsi)
8088 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8089 ; AVX2-FP-NEXT: vmovaps %ymm1, 128(%rsi)
8090 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8091 ; AVX2-FP-NEXT: vmovaps %ymm1, 64(%rsi)
8092 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8093 ; AVX2-FP-NEXT: vmovaps %ymm1, (%rsi)
8094 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8095 ; AVX2-FP-NEXT: vmovaps %ymm1, 224(%rsi)
8096 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8097 ; AVX2-FP-NEXT: vmovaps %ymm1, 160(%rsi)
8098 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8099 ; AVX2-FP-NEXT: vmovaps %ymm1, 96(%rsi)
8100 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8101 ; AVX2-FP-NEXT: vmovaps %ymm1, 32(%rsi)
8102 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8103 ; AVX2-FP-NEXT: vmovaps %ymm1, 192(%rdx)
8104 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8105 ; AVX2-FP-NEXT: vmovaps %ymm1, 128(%rdx)
8106 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8107 ; AVX2-FP-NEXT: vmovaps %ymm1, 64(%rdx)
8108 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8109 ; AVX2-FP-NEXT: vmovaps %ymm1, (%rdx)
8110 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8111 ; AVX2-FP-NEXT: vmovaps %ymm1, 224(%rdx)
8112 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8113 ; AVX2-FP-NEXT: vmovaps %ymm1, 160(%rdx)
8114 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8115 ; AVX2-FP-NEXT: vmovaps %ymm1, 96(%rdx)
8116 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8117 ; AVX2-FP-NEXT: vmovaps %ymm1, 32(%rdx)
8118 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8119 ; AVX2-FP-NEXT: vmovaps %ymm1, 192(%rcx)
8120 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8121 ; AVX2-FP-NEXT: vmovaps %ymm1, 128(%rcx)
8122 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8123 ; AVX2-FP-NEXT: vmovaps %ymm1, 64(%rcx)
8124 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8125 ; AVX2-FP-NEXT: vmovaps %ymm1, (%rcx)
8126 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8127 ; AVX2-FP-NEXT: vmovaps %ymm1, 224(%rcx)
8128 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8129 ; AVX2-FP-NEXT: vmovaps %ymm1, 160(%rcx)
8130 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8131 ; AVX2-FP-NEXT: vmovaps %ymm1, 96(%rcx)
8132 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8133 ; AVX2-FP-NEXT: vmovaps %ymm1, 32(%rcx)
8134 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8135 ; AVX2-FP-NEXT: vmovaps %ymm1, 192(%r8)
8136 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8137 ; AVX2-FP-NEXT: vmovaps %ymm1, 128(%r8)
8138 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8139 ; AVX2-FP-NEXT: vmovaps %ymm1, 64(%r8)
8140 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8141 ; AVX2-FP-NEXT: vmovaps %ymm1, (%r8)
8142 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8143 ; AVX2-FP-NEXT: vmovaps %ymm1, 224(%r8)
8144 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8145 ; AVX2-FP-NEXT: vmovaps %ymm1, 160(%r8)
8146 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8147 ; AVX2-FP-NEXT: vmovaps %ymm1, 96(%r8)
8148 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8149 ; AVX2-FP-NEXT: vmovaps %ymm1, 32(%r8)
8150 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8151 ; AVX2-FP-NEXT: vmovaps %ymm1, 224(%r9)
8152 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8153 ; AVX2-FP-NEXT: vmovaps %ymm1, 192(%r9)
8154 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8155 ; AVX2-FP-NEXT: vmovaps %ymm1, 160(%r9)
8156 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8157 ; AVX2-FP-NEXT: vmovaps %ymm1, 128(%r9)
8158 ; AVX2-FP-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload
8159 ; AVX2-FP-NEXT: vmovaps %ymm1, 96(%r9)
8160 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8161 ; AVX2-FP-NEXT: vmovaps %ymm1, 64(%r9)
8162 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8163 ; AVX2-FP-NEXT: vmovaps %ymm1, 32(%r9)
8164 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8165 ; AVX2-FP-NEXT: vmovaps %ymm1, (%r9)
8166 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
8167 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8168 ; AVX2-FP-NEXT: vmovaps %ymm1, 224(%rax)
8169 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8170 ; AVX2-FP-NEXT: vmovaps %ymm1, 192(%rax)
8171 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8172 ; AVX2-FP-NEXT: vmovaps %ymm1, 160(%rax)
8173 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8174 ; AVX2-FP-NEXT: vmovaps %ymm1, 128(%rax)
8175 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8176 ; AVX2-FP-NEXT: vmovaps %ymm1, 96(%rax)
8177 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8178 ; AVX2-FP-NEXT: vmovaps %ymm1, 64(%rax)
8179 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8180 ; AVX2-FP-NEXT: vmovaps %ymm1, 32(%rax)
8181 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8182 ; AVX2-FP-NEXT: vmovaps %ymm1, (%rax)
8183 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
8184 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8185 ; AVX2-FP-NEXT: vmovaps %ymm1, 224(%rax)
8186 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8187 ; AVX2-FP-NEXT: vmovaps %ymm1, 192(%rax)
8188 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8189 ; AVX2-FP-NEXT: vmovaps %ymm1, 160(%rax)
8190 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8191 ; AVX2-FP-NEXT: vmovaps %ymm1, 128(%rax)
8192 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8193 ; AVX2-FP-NEXT: vmovaps %ymm1, 96(%rax)
8194 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8195 ; AVX2-FP-NEXT: vmovaps %ymm1, 64(%rax)
8196 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8197 ; AVX2-FP-NEXT: vmovaps %ymm1, 32(%rax)
8198 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8199 ; AVX2-FP-NEXT: vmovaps %ymm1, (%rax)
8200 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
8201 ; AVX2-FP-NEXT: vmovaps %ymm0, 224(%rax)
8202 ; AVX2-FP-NEXT: vmovaps %ymm3, 192(%rax)
8203 ; AVX2-FP-NEXT: vmovaps %ymm7, 160(%rax)
8204 ; AVX2-FP-NEXT: vmovaps %ymm11, 128(%rax)
8205 ; AVX2-FP-NEXT: vmovaps %ymm13, 96(%rax)
8206 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8207 ; AVX2-FP-NEXT: vmovaps %ymm0, 64(%rax)
8208 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8209 ; AVX2-FP-NEXT: vmovaps %ymm0, 32(%rax)
8210 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8211 ; AVX2-FP-NEXT: vmovaps %ymm0, (%rax)
8212 ; AVX2-FP-NEXT: addq $2248, %rsp # imm = 0x8C8
8213 ; AVX2-FP-NEXT: vzeroupper
8214 ; AVX2-FP-NEXT: retq
8216 ; AVX2-FCP-LABEL: load_i64_stride8_vf32:
8217 ; AVX2-FCP: # %bb.0:
8218 ; AVX2-FCP-NEXT: subq $2248, %rsp # imm = 0x8C8
8219 ; AVX2-FCP-NEXT: vmovaps 320(%rdi), %xmm0
8220 ; AVX2-FCP-NEXT: vinsertf128 $1, 448(%rdi), %ymm0, %ymm0
8221 ; AVX2-FCP-NEXT: vmovaps 256(%rdi), %xmm1
8222 ; AVX2-FCP-NEXT: vinsertf128 $1, 384(%rdi), %ymm1, %ymm1
8223 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
8224 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8225 ; AVX2-FCP-NEXT: vmovaps 832(%rdi), %xmm2
8226 ; AVX2-FCP-NEXT: vinsertf128 $1, 960(%rdi), %ymm2, %ymm2
8227 ; AVX2-FCP-NEXT: vmovaps 768(%rdi), %xmm3
8228 ; AVX2-FCP-NEXT: vinsertf128 $1, 896(%rdi), %ymm3, %ymm3
8229 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
8230 ; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8231 ; AVX2-FCP-NEXT: vmovaps 1344(%rdi), %xmm4
8232 ; AVX2-FCP-NEXT: vinsertf128 $1, 1472(%rdi), %ymm4, %ymm4
8233 ; AVX2-FCP-NEXT: vmovaps 1280(%rdi), %xmm5
8234 ; AVX2-FCP-NEXT: vinsertf128 $1, 1408(%rdi), %ymm5, %ymm5
8235 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm5[0],ymm4[0],ymm5[2],ymm4[2]
8236 ; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8237 ; AVX2-FCP-NEXT: vmovaps 1856(%rdi), %xmm6
8238 ; AVX2-FCP-NEXT: vinsertf128 $1, 1984(%rdi), %ymm6, %ymm6
8239 ; AVX2-FCP-NEXT: vmovaps 1792(%rdi), %xmm7
8240 ; AVX2-FCP-NEXT: vinsertf128 $1, 1920(%rdi), %ymm7, %ymm7
8241 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm7[0],ymm6[0],ymm7[2],ymm6[2]
8242 ; AVX2-FCP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8243 ; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm9
8244 ; AVX2-FCP-NEXT: vmovaps 64(%rdi), %xmm8
8245 ; AVX2-FCP-NEXT: vinsertf128 $1, 192(%rdi), %ymm8, %ymm8
8246 ; AVX2-FCP-NEXT: vinsertf128 $1, 128(%rdi), %ymm9, %ymm9
8247 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2]
8248 ; AVX2-FCP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8249 ; AVX2-FCP-NEXT: vmovaps 576(%rdi), %xmm10
8250 ; AVX2-FCP-NEXT: vinsertf128 $1, 704(%rdi), %ymm10, %ymm10
8251 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
8252 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8253 ; AVX2-FCP-NEXT: vmovaps 512(%rdi), %xmm0
8254 ; AVX2-FCP-NEXT: vinsertf128 $1, 640(%rdi), %ymm0, %ymm0
8255 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm2[1],ymm3[3],ymm2[3]
8256 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8257 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm10[0],ymm0[2],ymm10[2]
8258 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8259 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm5[1],ymm4[1],ymm5[3],ymm4[3]
8260 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8261 ; AVX2-FCP-NEXT: vmovaps 1088(%rdi), %xmm1
8262 ; AVX2-FCP-NEXT: vinsertf128 $1, 1216(%rdi), %ymm1, %ymm1
8263 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm7[1],ymm6[1],ymm7[3],ymm6[3]
8264 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8265 ; AVX2-FCP-NEXT: vmovaps 1024(%rdi), %xmm2
8266 ; AVX2-FCP-NEXT: vinsertf128 $1, 1152(%rdi), %ymm2, %ymm2
8267 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm9[1],ymm8[1],ymm9[3],ymm8[3]
8268 ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8269 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
8270 ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8271 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm10[1],ymm0[3],ymm10[3]
8272 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8273 ; AVX2-FCP-NEXT: vmovaps 1600(%rdi), %xmm0
8274 ; AVX2-FCP-NEXT: vinsertf128 $1, 1728(%rdi), %ymm0, %ymm0
8275 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3]
8276 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8277 ; AVX2-FCP-NEXT: vmovaps 1536(%rdi), %xmm1
8278 ; AVX2-FCP-NEXT: vinsertf128 $1, 1664(%rdi), %ymm1, %ymm1
8279 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
8280 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8281 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
8282 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8283 ; AVX2-FCP-NEXT: vmovaps 320(%rdi), %ymm2
8284 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8285 ; AVX2-FCP-NEXT: vmovaps 256(%rdi), %ymm3
8286 ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8287 ; AVX2-FCP-NEXT: vmovaps 448(%rdi), %ymm1
8288 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8289 ; AVX2-FCP-NEXT: vmovaps 384(%rdi), %ymm0
8290 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8291 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
8292 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
8293 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
8294 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8295 ; AVX2-FCP-NEXT: vmovaps 832(%rdi), %ymm2
8296 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8297 ; AVX2-FCP-NEXT: vmovaps 768(%rdi), %ymm3
8298 ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8299 ; AVX2-FCP-NEXT: vmovaps 960(%rdi), %ymm1
8300 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8301 ; AVX2-FCP-NEXT: vmovaps 896(%rdi), %ymm0
8302 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8303 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
8304 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
8305 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
8306 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8307 ; AVX2-FCP-NEXT: vmovaps 1344(%rdi), %ymm2
8308 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8309 ; AVX2-FCP-NEXT: vmovaps 1280(%rdi), %ymm3
8310 ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8311 ; AVX2-FCP-NEXT: vmovaps 1472(%rdi), %ymm1
8312 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8313 ; AVX2-FCP-NEXT: vmovaps 1408(%rdi), %ymm0
8314 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8315 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
8316 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
8317 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
8318 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8319 ; AVX2-FCP-NEXT: vmovaps 1856(%rdi), %ymm2
8320 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8321 ; AVX2-FCP-NEXT: vmovaps 1792(%rdi), %ymm3
8322 ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8323 ; AVX2-FCP-NEXT: vmovaps 1984(%rdi), %ymm1
8324 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8325 ; AVX2-FCP-NEXT: vmovaps 1920(%rdi), %ymm0
8326 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8327 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
8328 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
8329 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
8330 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8331 ; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm2
8332 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8333 ; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm1
8334 ; AVX2-FCP-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill
8335 ; AVX2-FCP-NEXT: vmovaps 192(%rdi), %ymm14
8336 ; AVX2-FCP-NEXT: vmovaps 128(%rdi), %ymm12
8337 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm12[0],ymm14[0],ymm12[2],ymm14[2]
8338 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
8339 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
8340 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8341 ; AVX2-FCP-NEXT: vmovaps 576(%rdi), %ymm11
8342 ; AVX2-FCP-NEXT: vmovaps 512(%rdi), %ymm10
8343 ; AVX2-FCP-NEXT: vmovaps 704(%rdi), %ymm8
8344 ; AVX2-FCP-NEXT: vmovaps 640(%rdi), %ymm7
8345 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm8[0],ymm7[2],ymm8[2]
8346 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm10[0],ymm11[0],ymm10[2],ymm11[2]
8347 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3]
8348 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8349 ; AVX2-FCP-NEXT: vmovaps 1088(%rdi), %ymm6
8350 ; AVX2-FCP-NEXT: vmovaps 1024(%rdi), %ymm9
8351 ; AVX2-FCP-NEXT: vmovaps 1216(%rdi), %ymm5
8352 ; AVX2-FCP-NEXT: vmovaps 1152(%rdi), %ymm3
8353 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm5[0],ymm3[2],ymm5[2]
8354 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm9[0],ymm6[0],ymm9[2],ymm6[2]
8355 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3]
8356 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8357 ; AVX2-FCP-NEXT: vmovaps 1600(%rdi), %ymm4
8358 ; AVX2-FCP-NEXT: vmovaps 1536(%rdi), %ymm13
8359 ; AVX2-FCP-NEXT: vmovaps 1728(%rdi), %ymm2
8360 ; AVX2-FCP-NEXT: vmovaps 1664(%rdi), %ymm1
8361 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
8362 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm13[0],ymm4[0],ymm13[2],ymm4[2]
8363 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3]
8364 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8365 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8366 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
8367 ; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
8368 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
8369 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
8370 ; AVX2-FCP-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3]
8371 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3]
8372 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8373 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8374 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
8375 ; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
8376 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
8377 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
8378 ; AVX2-FCP-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3]
8379 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3]
8380 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8381 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8382 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
8383 ; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
8384 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
8385 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
8386 ; AVX2-FCP-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3]
8387 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3]
8388 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8389 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8390 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
8391 ; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
8392 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
8393 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
8394 ; AVX2-FCP-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3]
8395 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3]
8396 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8397 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm12[1],ymm14[1],ymm12[3],ymm14[3]
8398 ; AVX2-FCP-NEXT: vmovups (%rsp), %ymm12 # 32-byte Reload
8399 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload
8400 ; AVX2-FCP-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3]
8401 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3]
8402 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8403 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm8[1],ymm7[3],ymm8[3]
8404 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm10[1],ymm11[1],ymm10[3],ymm11[3]
8405 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],ymm0[2,3]
8406 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8407 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm5[1],ymm3[3],ymm5[3]
8408 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm9[1],ymm6[1],ymm9[3],ymm6[3]
8409 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
8410 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8411 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm2[1],ymm1[3],ymm2[3]
8412 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm13[1],ymm4[1],ymm13[3],ymm4[3]
8413 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
8414 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8415 ; AVX2-FCP-NEXT: vmovaps 96(%rdi), %xmm0
8416 ; AVX2-FCP-NEXT: vinsertf128 $1, 224(%rdi), %ymm0, %ymm0
8417 ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %xmm1
8418 ; AVX2-FCP-NEXT: vinsertf128 $1, 160(%rdi), %ymm1, %ymm1
8419 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
8420 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8421 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
8422 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8423 ; AVX2-FCP-NEXT: vmovaps 352(%rdi), %xmm0
8424 ; AVX2-FCP-NEXT: vinsertf128 $1, 480(%rdi), %ymm0, %ymm0
8425 ; AVX2-FCP-NEXT: vmovaps 288(%rdi), %xmm1
8426 ; AVX2-FCP-NEXT: vinsertf128 $1, 416(%rdi), %ymm1, %ymm1
8427 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
8428 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8429 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
8430 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8431 ; AVX2-FCP-NEXT: vmovaps 608(%rdi), %xmm0
8432 ; AVX2-FCP-NEXT: vinsertf128 $1, 736(%rdi), %ymm0, %ymm0
8433 ; AVX2-FCP-NEXT: vmovaps 544(%rdi), %xmm1
8434 ; AVX2-FCP-NEXT: vinsertf128 $1, 672(%rdi), %ymm1, %ymm1
8435 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
8436 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8437 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
8438 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8439 ; AVX2-FCP-NEXT: vmovaps 864(%rdi), %xmm0
8440 ; AVX2-FCP-NEXT: vinsertf128 $1, 992(%rdi), %ymm0, %ymm0
8441 ; AVX2-FCP-NEXT: vmovaps 800(%rdi), %xmm1
8442 ; AVX2-FCP-NEXT: vinsertf128 $1, 928(%rdi), %ymm1, %ymm1
8443 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
8444 ; AVX2-FCP-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill
8445 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
8446 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8447 ; AVX2-FCP-NEXT: vmovaps 1120(%rdi), %xmm0
8448 ; AVX2-FCP-NEXT: vinsertf128 $1, 1248(%rdi), %ymm0, %ymm0
8449 ; AVX2-FCP-NEXT: vmovaps 1056(%rdi), %xmm1
8450 ; AVX2-FCP-NEXT: vinsertf128 $1, 1184(%rdi), %ymm1, %ymm1
8451 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
8452 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8453 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
8454 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8455 ; AVX2-FCP-NEXT: vmovaps 1376(%rdi), %xmm0
8456 ; AVX2-FCP-NEXT: vinsertf128 $1, 1504(%rdi), %ymm0, %ymm0
8457 ; AVX2-FCP-NEXT: vmovaps 1312(%rdi), %xmm1
8458 ; AVX2-FCP-NEXT: vinsertf128 $1, 1440(%rdi), %ymm1, %ymm1
8459 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
8460 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8461 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
8462 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8463 ; AVX2-FCP-NEXT: vmovaps 1632(%rdi), %xmm0
8464 ; AVX2-FCP-NEXT: vinsertf128 $1, 1760(%rdi), %ymm0, %ymm0
8465 ; AVX2-FCP-NEXT: vmovaps 1568(%rdi), %xmm1
8466 ; AVX2-FCP-NEXT: vinsertf128 $1, 1696(%rdi), %ymm1, %ymm1
8467 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
8468 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8469 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
8470 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8471 ; AVX2-FCP-NEXT: vmovaps 1888(%rdi), %xmm0
8472 ; AVX2-FCP-NEXT: vinsertf128 $1, 2016(%rdi), %ymm0, %ymm0
8473 ; AVX2-FCP-NEXT: vmovaps 1824(%rdi), %xmm1
8474 ; AVX2-FCP-NEXT: vinsertf128 $1, 1952(%rdi), %ymm1, %ymm1
8475 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
8476 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8477 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
8478 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8479 ; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm2
8480 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8481 ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm3
8482 ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8483 ; AVX2-FCP-NEXT: vmovaps 224(%rdi), %ymm1
8484 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8485 ; AVX2-FCP-NEXT: vmovaps 160(%rdi), %ymm0
8486 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8487 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
8488 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
8489 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
8490 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8491 ; AVX2-FCP-NEXT: vmovaps 352(%rdi), %ymm2
8492 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8493 ; AVX2-FCP-NEXT: vmovaps 288(%rdi), %ymm3
8494 ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8495 ; AVX2-FCP-NEXT: vmovaps 480(%rdi), %ymm1
8496 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8497 ; AVX2-FCP-NEXT: vmovaps 416(%rdi), %ymm0
8498 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8499 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
8500 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
8501 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
8502 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8503 ; AVX2-FCP-NEXT: vmovaps 608(%rdi), %ymm2
8504 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8505 ; AVX2-FCP-NEXT: vmovaps 544(%rdi), %ymm3
8506 ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8507 ; AVX2-FCP-NEXT: vmovaps 736(%rdi), %ymm1
8508 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8509 ; AVX2-FCP-NEXT: vmovaps 672(%rdi), %ymm0
8510 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8511 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
8512 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
8513 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
8514 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8515 ; AVX2-FCP-NEXT: vmovaps 864(%rdi), %ymm2
8516 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8517 ; AVX2-FCP-NEXT: vmovaps 800(%rdi), %ymm3
8518 ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8519 ; AVX2-FCP-NEXT: vmovaps 992(%rdi), %ymm1
8520 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8521 ; AVX2-FCP-NEXT: vmovaps 928(%rdi), %ymm0
8522 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8523 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
8524 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
8525 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
8526 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8527 ; AVX2-FCP-NEXT: vmovaps 1120(%rdi), %ymm2
8528 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8529 ; AVX2-FCP-NEXT: vmovaps 1056(%rdi), %ymm1
8530 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8531 ; AVX2-FCP-NEXT: vmovaps 1248(%rdi), %ymm12
8532 ; AVX2-FCP-NEXT: vmovaps 1184(%rdi), %ymm11
8533 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm12[0],ymm11[2],ymm12[2]
8534 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
8535 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
8536 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8537 ; AVX2-FCP-NEXT: vmovaps 1376(%rdi), %ymm10
8538 ; AVX2-FCP-NEXT: vmovaps 1312(%rdi), %ymm9
8539 ; AVX2-FCP-NEXT: vmovaps 1504(%rdi), %ymm8
8540 ; AVX2-FCP-NEXT: vmovaps 1440(%rdi), %ymm7
8541 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm8[0],ymm7[2],ymm8[2]
8542 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm9[0],ymm10[0],ymm9[2],ymm10[2]
8543 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3]
8544 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8545 ; AVX2-FCP-NEXT: vmovaps 1632(%rdi), %ymm6
8546 ; AVX2-FCP-NEXT: vmovaps 1568(%rdi), %ymm15
8547 ; AVX2-FCP-NEXT: vmovaps 1760(%rdi), %ymm5
8548 ; AVX2-FCP-NEXT: vmovaps 1696(%rdi), %ymm3
8549 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm5[0],ymm3[2],ymm5[2]
8550 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm15[0],ymm6[0],ymm15[2],ymm6[2]
8551 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm14[2,3],ymm0[2,3]
8552 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8553 ; AVX2-FCP-NEXT: vmovaps 1888(%rdi), %ymm4
8554 ; AVX2-FCP-NEXT: vmovaps 1824(%rdi), %ymm14
8555 ; AVX2-FCP-NEXT: vmovaps 2016(%rdi), %ymm2
8556 ; AVX2-FCP-NEXT: vmovaps 1952(%rdi), %ymm1
8557 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
8558 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm14[0],ymm4[0],ymm14[2],ymm4[2]
8559 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3]
8560 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8561 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8562 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
8563 ; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
8564 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
8565 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
8566 ; AVX2-FCP-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
8567 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3]
8568 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8569 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8570 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
8571 ; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
8572 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
8573 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
8574 ; AVX2-FCP-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
8575 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3]
8576 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8577 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8578 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
8579 ; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
8580 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
8581 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
8582 ; AVX2-FCP-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
8583 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3]
8584 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8585 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8586 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
8587 ; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
8588 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
8589 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
8590 ; AVX2-FCP-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
8591 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm13[2,3],ymm0[2,3]
8592 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm12[1],ymm11[3],ymm12[3]
8593 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
8594 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
8595 ; AVX2-FCP-NEXT: # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3]
8596 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm11[2,3],ymm0[2,3]
8597 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm8[1],ymm7[3],ymm8[3]
8598 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm9[1],ymm10[1],ymm9[3],ymm10[3]
8599 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm7[2,3],ymm0[2,3]
8600 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm5[1],ymm3[3],ymm5[3]
8601 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm15[1],ymm6[1],ymm15[3],ymm6[3]
8602 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm0[2,3]
8603 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm2[1],ymm1[3],ymm2[3]
8604 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm14[1],ymm4[1],ymm14[3],ymm4[3]
8605 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
8606 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8607 ; AVX2-FCP-NEXT: vmovaps %ymm1, 192(%rsi)
8608 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8609 ; AVX2-FCP-NEXT: vmovaps %ymm1, 128(%rsi)
8610 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8611 ; AVX2-FCP-NEXT: vmovaps %ymm1, 64(%rsi)
8612 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8613 ; AVX2-FCP-NEXT: vmovaps %ymm1, (%rsi)
8614 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8615 ; AVX2-FCP-NEXT: vmovaps %ymm1, 224(%rsi)
8616 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8617 ; AVX2-FCP-NEXT: vmovaps %ymm1, 160(%rsi)
8618 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8619 ; AVX2-FCP-NEXT: vmovaps %ymm1, 96(%rsi)
8620 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8621 ; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%rsi)
8622 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8623 ; AVX2-FCP-NEXT: vmovaps %ymm1, 192(%rdx)
8624 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8625 ; AVX2-FCP-NEXT: vmovaps %ymm1, 128(%rdx)
8626 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8627 ; AVX2-FCP-NEXT: vmovaps %ymm1, 64(%rdx)
8628 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8629 ; AVX2-FCP-NEXT: vmovaps %ymm1, (%rdx)
8630 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8631 ; AVX2-FCP-NEXT: vmovaps %ymm1, 224(%rdx)
8632 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8633 ; AVX2-FCP-NEXT: vmovaps %ymm1, 160(%rdx)
8634 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8635 ; AVX2-FCP-NEXT: vmovaps %ymm1, 96(%rdx)
8636 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8637 ; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%rdx)
8638 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8639 ; AVX2-FCP-NEXT: vmovaps %ymm1, 192(%rcx)
8640 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8641 ; AVX2-FCP-NEXT: vmovaps %ymm1, 128(%rcx)
8642 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8643 ; AVX2-FCP-NEXT: vmovaps %ymm1, 64(%rcx)
8644 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8645 ; AVX2-FCP-NEXT: vmovaps %ymm1, (%rcx)
8646 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8647 ; AVX2-FCP-NEXT: vmovaps %ymm1, 224(%rcx)
8648 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8649 ; AVX2-FCP-NEXT: vmovaps %ymm1, 160(%rcx)
8650 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8651 ; AVX2-FCP-NEXT: vmovaps %ymm1, 96(%rcx)
8652 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8653 ; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%rcx)
8654 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8655 ; AVX2-FCP-NEXT: vmovaps %ymm1, 192(%r8)
8656 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8657 ; AVX2-FCP-NEXT: vmovaps %ymm1, 128(%r8)
8658 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8659 ; AVX2-FCP-NEXT: vmovaps %ymm1, 64(%r8)
8660 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8661 ; AVX2-FCP-NEXT: vmovaps %ymm1, (%r8)
8662 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8663 ; AVX2-FCP-NEXT: vmovaps %ymm1, 224(%r8)
8664 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8665 ; AVX2-FCP-NEXT: vmovaps %ymm1, 160(%r8)
8666 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8667 ; AVX2-FCP-NEXT: vmovaps %ymm1, 96(%r8)
8668 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8669 ; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%r8)
8670 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8671 ; AVX2-FCP-NEXT: vmovaps %ymm1, 224(%r9)
8672 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8673 ; AVX2-FCP-NEXT: vmovaps %ymm1, 192(%r9)
8674 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8675 ; AVX2-FCP-NEXT: vmovaps %ymm1, 160(%r9)
8676 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8677 ; AVX2-FCP-NEXT: vmovaps %ymm1, 128(%r9)
8678 ; AVX2-FCP-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload
8679 ; AVX2-FCP-NEXT: vmovaps %ymm1, 96(%r9)
8680 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8681 ; AVX2-FCP-NEXT: vmovaps %ymm1, 64(%r9)
8682 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8683 ; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%r9)
8684 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8685 ; AVX2-FCP-NEXT: vmovaps %ymm1, (%r9)
8686 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
8687 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8688 ; AVX2-FCP-NEXT: vmovaps %ymm1, 224(%rax)
8689 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8690 ; AVX2-FCP-NEXT: vmovaps %ymm1, 192(%rax)
8691 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8692 ; AVX2-FCP-NEXT: vmovaps %ymm1, 160(%rax)
8693 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8694 ; AVX2-FCP-NEXT: vmovaps %ymm1, 128(%rax)
8695 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8696 ; AVX2-FCP-NEXT: vmovaps %ymm1, 96(%rax)
8697 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8698 ; AVX2-FCP-NEXT: vmovaps %ymm1, 64(%rax)
8699 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8700 ; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%rax)
8701 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8702 ; AVX2-FCP-NEXT: vmovaps %ymm1, (%rax)
8703 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
8704 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8705 ; AVX2-FCP-NEXT: vmovaps %ymm1, 224(%rax)
8706 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8707 ; AVX2-FCP-NEXT: vmovaps %ymm1, 192(%rax)
8708 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8709 ; AVX2-FCP-NEXT: vmovaps %ymm1, 160(%rax)
8710 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8711 ; AVX2-FCP-NEXT: vmovaps %ymm1, 128(%rax)
8712 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8713 ; AVX2-FCP-NEXT: vmovaps %ymm1, 96(%rax)
8714 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8715 ; AVX2-FCP-NEXT: vmovaps %ymm1, 64(%rax)
8716 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8717 ; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%rax)
8718 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8719 ; AVX2-FCP-NEXT: vmovaps %ymm1, (%rax)
8720 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
8721 ; AVX2-FCP-NEXT: vmovaps %ymm0, 224(%rax)
8722 ; AVX2-FCP-NEXT: vmovaps %ymm3, 192(%rax)
8723 ; AVX2-FCP-NEXT: vmovaps %ymm7, 160(%rax)
8724 ; AVX2-FCP-NEXT: vmovaps %ymm11, 128(%rax)
8725 ; AVX2-FCP-NEXT: vmovaps %ymm13, 96(%rax)
8726 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8727 ; AVX2-FCP-NEXT: vmovaps %ymm0, 64(%rax)
8728 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8729 ; AVX2-FCP-NEXT: vmovaps %ymm0, 32(%rax)
8730 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8731 ; AVX2-FCP-NEXT: vmovaps %ymm0, (%rax)
8732 ; AVX2-FCP-NEXT: addq $2248, %rsp # imm = 0x8C8
8733 ; AVX2-FCP-NEXT: vzeroupper
8734 ; AVX2-FCP-NEXT: retq
8736 ; AVX512-LABEL: load_i64_stride8_vf32:
8738 ; AVX512-NEXT: subq $3208, %rsp # imm = 0xC88
8739 ; AVX512-NEXT: vmovdqa64 1856(%rdi), %zmm12
8740 ; AVX512-NEXT: vmovdqa64 1984(%rdi), %zmm16
8741 ; AVX512-NEXT: vmovdqa64 1344(%rdi), %zmm19
8742 ; AVX512-NEXT: vmovdqa64 1280(%rdi), %zmm18
8743 ; AVX512-NEXT: vmovdqa64 1472(%rdi), %zmm24
8744 ; AVX512-NEXT: vmovdqa64 1408(%rdi), %zmm27
8745 ; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm7
8746 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8747 ; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm11
8748 ; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8749 ; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm31
8750 ; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm3
8751 ; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm20
8752 ; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm8
8753 ; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8754 ; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm25
8755 ; AVX512-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8756 ; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm23
8757 ; AVX512-NEXT: movb $-64, %al
8758 ; AVX512-NEXT: kmovw %eax, %k1
8759 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10]
8760 ; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8761 ; AVX512-NEXT: vmovdqa64 %zmm27, %zmm1
8762 ; AVX512-NEXT: vpermt2q %zmm24, %zmm0, %zmm1
8763 ; AVX512-NEXT: vmovdqa64 %zmm18, %zmm2
8764 ; AVX512-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8765 ; AVX512-NEXT: vpermt2q %zmm19, %zmm0, %zmm2
8766 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
8767 ; AVX512-NEXT: vmovdqa64 1216(%rdi), %ymm22
8768 ; AVX512-NEXT: vmovdqa 1152(%rdi), %ymm10
8769 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm10[0],ymm22[0],ymm10[2],ymm22[2]
8770 ; AVX512-NEXT: vmovdqa 1088(%rdi), %ymm9
8771 ; AVX512-NEXT: vmovdqa 1024(%rdi), %ymm6
8772 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm6[0],ymm9[0],ymm6[2],ymm9[2]
8773 ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm13[2,3],ymm4[2,3]
8774 ; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm1
8775 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8776 ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm2
8777 ; AVX512-NEXT: vpermt2q %zmm31, %zmm0, %zmm2
8778 ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm4
8779 ; AVX512-NEXT: vpermt2q %zmm7, %zmm0, %zmm4
8780 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1}
8781 ; AVX512-NEXT: vmovdqa 704(%rdi), %ymm7
8782 ; AVX512-NEXT: vmovdqa 640(%rdi), %ymm13
8783 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm13[0],ymm7[0],ymm13[2],ymm7[2]
8784 ; AVX512-NEXT: vmovdqa64 576(%rdi), %ymm17
8785 ; AVX512-NEXT: vmovdqa64 512(%rdi), %ymm21
8786 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm21[0],ymm17[0],ymm21[2],ymm17[2]
8787 ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm14[2,3]
8788 ; AVX512-NEXT: vinserti64x4 $0, %ymm11, %zmm4, %zmm1
8789 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8790 ; AVX512-NEXT: vmovdqa64 %zmm23, %zmm4
8791 ; AVX512-NEXT: vpermt2q %zmm25, %zmm0, %zmm4
8792 ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm11
8793 ; AVX512-NEXT: vpermt2q %zmm20, %zmm0, %zmm11
8794 ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm11 {%k1}
8795 ; AVX512-NEXT: vmovdqa 192(%rdi), %ymm4
8796 ; AVX512-NEXT: vmovdqa 128(%rdi), %ymm14
8797 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm14[0],ymm4[0],ymm14[2],ymm4[2]
8798 ; AVX512-NEXT: vmovdqa64 64(%rdi), %ymm25
8799 ; AVX512-NEXT: vmovdqa64 (%rdi), %ymm28
8800 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm28[0],ymm25[0],ymm28[2],ymm25[2]
8801 ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3]
8802 ; AVX512-NEXT: vmovdqa64 1920(%rdi), %zmm29
8803 ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1
8804 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8805 ; AVX512-NEXT: vmovdqa64 %zmm29, %zmm1
8806 ; AVX512-NEXT: vpermt2q %zmm16, %zmm0, %zmm1
8807 ; AVX512-NEXT: vmovdqa64 1792(%rdi), %zmm26
8808 ; AVX512-NEXT: vpermi2q %zmm12, %zmm26, %zmm0
8809 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
8810 ; AVX512-NEXT: vmovdqa 1728(%rdi), %ymm2
8811 ; AVX512-NEXT: vmovdqa 1664(%rdi), %ymm8
8812 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm8[0],ymm2[0],ymm8[2],ymm2[2]
8813 ; AVX512-NEXT: vmovdqa64 1600(%rdi), %ymm30
8814 ; AVX512-NEXT: vmovdqa 1536(%rdi), %ymm1
8815 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm1[0],ymm30[0],ymm1[2],ymm30[2]
8816 ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm15[2,3],ymm11[2,3]
8817 ; AVX512-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm0
8818 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8819 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11]
8820 ; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8821 ; AVX512-NEXT: vmovdqa64 %zmm27, %zmm11
8822 ; AVX512-NEXT: vpermt2q %zmm24, %zmm0, %zmm11
8823 ; AVX512-NEXT: vpermt2q %zmm19, %zmm0, %zmm18
8824 ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm18 {%k1}
8825 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm10[1],ymm22[1],ymm10[3],ymm22[3]
8826 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm9[1],ymm6[3],ymm9[3]
8827 ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm6[2,3],ymm5[2,3]
8828 ; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm18, %zmm5
8829 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8830 ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm5
8831 ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm10
8832 ; AVX512-NEXT: vpermt2q %zmm31, %zmm0, %zmm5
8833 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
8834 ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm6
8835 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
8836 ; AVX512-NEXT: vpermt2q %zmm22, %zmm0, %zmm6
8837 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1}
8838 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm13[1],ymm7[1],ymm13[3],ymm7[3]
8839 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm21[1],ymm17[1],ymm21[3],ymm17[3]
8840 ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm5[2,3],ymm3[2,3]
8841 ; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3
8842 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8843 ; AVX512-NEXT: vmovdqu64 %zmm23, (%rsp) # 64-byte Spill
8844 ; AVX512-NEXT: vmovdqa64 %zmm23, %zmm3
8845 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
8846 ; AVX512-NEXT: vpermt2q %zmm18, %zmm0, %zmm3
8847 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
8848 ; AVX512-NEXT: vmovdqa64 %zmm21, %zmm5
8849 ; AVX512-NEXT: vpermt2q %zmm20, %zmm0, %zmm5
8850 ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1}
8851 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3]
8852 ; AVX512-NEXT: vmovdqa64 1088(%rdi), %zmm7
8853 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm28[1],ymm25[1],ymm28[3],ymm25[3]
8854 ; AVX512-NEXT: vmovdqa64 1024(%rdi), %zmm6
8855 ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3]
8856 ; AVX512-NEXT: vmovdqa64 1216(%rdi), %zmm13
8857 ; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3
8858 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8859 ; AVX512-NEXT: vmovdqa64 %zmm29, %zmm15
8860 ; AVX512-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8861 ; AVX512-NEXT: vmovdqa64 %zmm29, %zmm3
8862 ; AVX512-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8863 ; AVX512-NEXT: vpermt2q %zmm16, %zmm0, %zmm3
8864 ; AVX512-NEXT: vmovdqa64 %zmm26, %zmm9
8865 ; AVX512-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8866 ; AVX512-NEXT: vpermi2q %zmm12, %zmm26, %zmm0
8867 ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1}
8868 ; AVX512-NEXT: vmovdqa64 1152(%rdi), %zmm4
8869 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm2[1],ymm8[3],ymm2[3]
8870 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],ymm30[1],ymm1[3],ymm30[3]
8871 ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3]
8872 ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
8873 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8874 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12]
8875 ; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8876 ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm1
8877 ; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8878 ; AVX512-NEXT: vpermt2q %zmm13, %zmm0, %zmm1
8879 ; AVX512-NEXT: vmovdqa64 %zmm13, %zmm8
8880 ; AVX512-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8881 ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm2
8882 ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm5
8883 ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8884 ; AVX512-NEXT: vpermt2q %zmm7, %zmm0, %zmm2
8885 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8886 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
8887 ; AVX512-NEXT: vmovdqa64 %zmm27, %zmm2
8888 ; AVX512-NEXT: vpermt2q %zmm24, %zmm0, %zmm2
8889 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
8890 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm17[0],zmm19[0],zmm17[2],zmm19[2],zmm17[4],zmm19[4],zmm17[6],zmm19[6]
8891 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
8892 ; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm25
8893 ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
8894 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8895 ; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm1
8896 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8897 ; AVX512-NEXT: vpermt2q %zmm25, %zmm0, %zmm1
8898 ; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm26
8899 ; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm14
8900 ; AVX512-NEXT: vmovdqa64 %zmm14, %zmm2
8901 ; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8902 ; AVX512-NEXT: vpermt2q %zmm26, %zmm0, %zmm2
8903 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
8904 ; AVX512-NEXT: vmovdqa64 %zmm10, %zmm2
8905 ; AVX512-NEXT: vmovdqa64 %zmm10, %zmm29
8906 ; AVX512-NEXT: vpermt2q %zmm31, %zmm0, %zmm2
8907 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm11[0],zmm22[0],zmm11[2],zmm22[2],zmm11[4],zmm22[4],zmm11[6],zmm22[6]
8908 ; AVX512-NEXT: vmovdqa64 %zmm22, %zmm10
8909 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
8910 ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
8911 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8912 ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm2
8913 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8914 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm1
8915 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8916 ; AVX512-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
8917 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm6
8918 ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8919 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm2
8920 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8921 ; AVX512-NEXT: vpermt2q %zmm6, %zmm0, %zmm2
8922 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
8923 ; AVX512-NEXT: vpermt2q %zmm18, %zmm0, %zmm23
8924 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm20[0],zmm21[2],zmm20[2],zmm21[4],zmm20[4],zmm21[6],zmm20[6]
8925 ; AVX512-NEXT: vmovdqa64 %zmm21, %zmm22
8926 ; AVX512-NEXT: vmovdqa64 %zmm23, %zmm3 {%k1}
8927 ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
8928 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8929 ; AVX512-NEXT: vmovdqa64 1728(%rdi), %zmm18
8930 ; AVX512-NEXT: vmovdqa64 1664(%rdi), %zmm1
8931 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8932 ; AVX512-NEXT: vpermt2q %zmm18, %zmm0, %zmm1
8933 ; AVX512-NEXT: vmovdqa64 1600(%rdi), %zmm13
8934 ; AVX512-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8935 ; AVX512-NEXT: vmovdqa64 1536(%rdi), %zmm6
8936 ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm2
8937 ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8938 ; AVX512-NEXT: vpermt2q %zmm13, %zmm0, %zmm2
8939 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
8940 ; AVX512-NEXT: vpermi2q %zmm16, %zmm15, %zmm0
8941 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm9[0],zmm12[0],zmm9[2],zmm12[2],zmm9[4],zmm12[4],zmm9[6],zmm12[6]
8942 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
8943 ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0
8944 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8945 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [5,13,5,13,5,13,5,13]
8946 ; AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8947 ; AVX512-NEXT: vpermt2q %zmm8, %zmm2, %zmm4
8948 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm1
8949 ; AVX512-NEXT: vpermt2q %zmm7, %zmm2, %zmm1
8950 ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm4[4,5,6,7]
8951 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [0,8,0,8,0,8,0,8]
8952 ; AVX512-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8953 ; AVX512-NEXT: vmovdqa64 %zmm17, %zmm9
8954 ; AVX512-NEXT: vmovdqa64 %zmm17, %zmm0
8955 ; AVX512-NEXT: vpermt2q %zmm19, %zmm21, %zmm0
8956 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8957 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [1,9,1,9,1,9,1,9]
8958 ; AVX512-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8959 ; AVX512-NEXT: vmovdqa64 %zmm17, %zmm0
8960 ; AVX512-NEXT: vpermt2q %zmm19, %zmm16, %zmm0
8961 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8962 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [6,14,6,14,6,14,6,14]
8963 ; AVX512-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8964 ; AVX512-NEXT: vmovdqa64 %zmm17, %zmm0
8965 ; AVX512-NEXT: vpermt2q %zmm19, %zmm23, %zmm0
8966 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm17[1],zmm19[1],zmm17[3],zmm19[3],zmm17[5],zmm19[5],zmm17[7],zmm19[7]
8967 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,7,15,7,15,7,15]
8968 ; AVX512-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8969 ; AVX512-NEXT: vpermt2q %zmm19, %zmm17, %zmm9
8970 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm27[0],zmm24[0],zmm27[2],zmm24[2],zmm27[4],zmm24[4],zmm27[6],zmm24[6]
8971 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8972 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm27[1],zmm24[1],zmm27[3],zmm24[3],zmm27[5],zmm24[5],zmm27[7],zmm24[7]
8973 ; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8974 ; AVX512-NEXT: vmovdqa64 %zmm27, %zmm0
8975 ; AVX512-NEXT: vmovdqa64 %zmm27, %zmm8
8976 ; AVX512-NEXT: vpermt2q %zmm24, %zmm21, %zmm27
8977 ; AVX512-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8978 ; AVX512-NEXT: vpermt2q %zmm24, %zmm16, %zmm0
8979 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8980 ; AVX512-NEXT: vpermt2q %zmm24, %zmm2, %zmm8
8981 ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1}
8982 ; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm0
8983 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8984 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
8985 ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm7
8986 ; AVX512-NEXT: vmovdqa64 %zmm25, %zmm9
8987 ; AVX512-NEXT: vpermt2q %zmm25, %zmm2, %zmm7
8988 ; AVX512-NEXT: vmovdqa64 %zmm26, %zmm13
8989 ; AVX512-NEXT: vpermt2q %zmm26, %zmm2, %zmm14
8990 ; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm7[4,5,6,7]
8991 ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm0
8992 ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm4
8993 ; AVX512-NEXT: vpermt2q %zmm10, %zmm21, %zmm4
8994 ; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8995 ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm4
8996 ; AVX512-NEXT: vpermt2q %zmm10, %zmm16, %zmm4
8997 ; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8998 ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm30
8999 ; AVX512-NEXT: vpermt2q %zmm10, %zmm23, %zmm30
9000 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm11[1],zmm10[1],zmm11[3],zmm10[3],zmm11[5],zmm10[5],zmm11[7],zmm10[7]
9001 ; AVX512-NEXT: vpermt2q %zmm10, %zmm17, %zmm0
9002 ; AVX512-NEXT: vmovdqa64 %zmm29, %zmm4
9003 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k1} = zmm29[0],zmm31[0],zmm29[2],zmm31[2],zmm29[4],zmm31[4],zmm29[6],zmm31[6]
9004 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm29[1],zmm31[1],zmm29[3],zmm31[3],zmm29[5],zmm31[5],zmm29[7],zmm31[7]
9005 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9006 ; AVX512-NEXT: vmovdqa64 %zmm29, %zmm0
9007 ; AVX512-NEXT: vmovdqa64 %zmm29, %zmm14
9008 ; AVX512-NEXT: vpermt2q %zmm31, %zmm21, %zmm4
9009 ; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9010 ; AVX512-NEXT: vpermt2q %zmm31, %zmm16, %zmm0
9011 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9012 ; AVX512-NEXT: vpermt2q %zmm31, %zmm2, %zmm14
9013 ; AVX512-NEXT: vmovdqa64 %zmm14, %zmm11 {%k1}
9014 ; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm0
9015 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9016 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
9017 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
9018 ; AVX512-NEXT: vpermt2q %zmm28, %zmm2, %zmm8
9019 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
9020 ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm11
9021 ; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm11 # 64-byte Folded Reload
9022 ; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm8[4,5,6,7]
9023 ; AVX512-NEXT: vmovdqa64 %zmm22, %zmm0
9024 ; AVX512-NEXT: vmovdqa64 %zmm22, %zmm25
9025 ; AVX512-NEXT: vpermt2q %zmm20, %zmm21, %zmm25
9026 ; AVX512-NEXT: vmovdqa64 %zmm22, %zmm5
9027 ; AVX512-NEXT: vpermt2q %zmm20, %zmm16, %zmm5
9028 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9029 ; AVX512-NEXT: vmovdqa64 %zmm22, %zmm27
9030 ; AVX512-NEXT: vpermt2q %zmm20, %zmm23, %zmm27
9031 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm22[1],zmm20[1],zmm22[3],zmm20[3],zmm22[5],zmm20[5],zmm22[7],zmm20[7]
9032 ; AVX512-NEXT: vpermt2q %zmm20, %zmm17, %zmm0
9033 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
9034 ; AVX512-NEXT: vmovdqu64 (%rsp), %zmm31 # 64-byte Reload
9035 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k1} = zmm31[0],zmm1[0],zmm31[2],zmm1[2],zmm31[4],zmm1[4],zmm31[6],zmm1[6]
9036 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm31[1],zmm1[1],zmm31[3],zmm1[3],zmm31[5],zmm1[5],zmm31[7],zmm1[7]
9037 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9038 ; AVX512-NEXT: vmovdqa64 %zmm31, %zmm0
9039 ; AVX512-NEXT: vmovdqa64 %zmm31, %zmm24
9040 ; AVX512-NEXT: vpermt2q %zmm1, %zmm21, %zmm31
9041 ; AVX512-NEXT: vpermt2q %zmm1, %zmm16, %zmm0
9042 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9043 ; AVX512-NEXT: vpermt2q %zmm1, %zmm2, %zmm24
9044 ; AVX512-NEXT: vmovdqa64 %zmm24, %zmm14 {%k1}
9045 ; AVX512-NEXT: vinserti64x4 $0, %ymm11, %zmm14, %zmm0
9046 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9047 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
9048 ; AVX512-NEXT: vmovdqa64 %zmm15, %zmm11
9049 ; AVX512-NEXT: vpermt2q %zmm18, %zmm2, %zmm11
9050 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
9051 ; AVX512-NEXT: vpermt2q %zmm19, %zmm2, %zmm6
9052 ; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0,1,2,3],ymm11[4,5,6,7]
9053 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
9054 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
9055 ; AVX512-NEXT: vpermi2q %zmm20, %zmm5, %zmm2
9056 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
9057 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm22[1],zmm12[1],zmm22[3],zmm12[3],zmm22[5],zmm12[5],zmm22[7],zmm12[7]
9058 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm14 {%k1}
9059 ; AVX512-NEXT: vinserti64x4 $0, %ymm11, %zmm14, %zmm2
9060 ; AVX512-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill
9061 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm2
9062 ; AVX512-NEXT: vpermt2q %zmm20, %zmm21, %zmm2
9063 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9064 ; AVX512-NEXT: vpermi2q %zmm12, %zmm22, %zmm21
9065 ; AVX512-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9066 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm2
9067 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm21
9068 ; AVX512-NEXT: vpermt2q %zmm20, %zmm16, %zmm2
9069 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9070 ; AVX512-NEXT: vpermi2q %zmm12, %zmm22, %zmm16
9071 ; AVX512-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9072 ; AVX512-NEXT: vmovdqa64 %zmm12, %zmm16
9073 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
9074 ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm14
9075 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
9076 ; AVX512-NEXT: vpermt2q %zmm24, %zmm23, %zmm14
9077 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
9078 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
9079 ; AVX512-NEXT: vpermt2q %zmm26, %zmm23, %zmm11
9080 ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm12
9081 ; AVX512-NEXT: vpermt2q %zmm9, %zmm23, %zmm12
9082 ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm29
9083 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
9084 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm8
9085 ; AVX512-NEXT: vpermt2q %zmm13, %zmm23, %zmm8
9086 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
9087 ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm10
9088 ; AVX512-NEXT: vpermt2q %zmm28, %zmm23, %zmm10
9089 ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm5
9090 ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm7
9091 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
9092 ; AVX512-NEXT: vpermt2q %zmm4, %zmm23, %zmm7
9093 ; AVX512-NEXT: vmovdqa64 %zmm15, %zmm0
9094 ; AVX512-NEXT: vpermt2q %zmm18, %zmm23, %zmm0
9095 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9096 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
9097 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0
9098 ; AVX512-NEXT: vpermt2q %zmm19, %zmm23, %zmm0
9099 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9100 ; AVX512-NEXT: vpermi2q %zmm16, %zmm22, %zmm23
9101 ; AVX512-NEXT: vpermt2q %zmm16, %zmm17, %zmm22
9102 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k1} = zmm21[0],zmm20[0],zmm21[2],zmm20[2],zmm21[4],zmm20[4],zmm21[6],zmm20[6]
9103 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm22 {%k1} = zmm21[1],zmm20[1],zmm21[3],zmm20[3],zmm21[5],zmm20[5],zmm21[7],zmm20[7]
9104 ; AVX512-NEXT: vpermt2q %zmm29, %zmm17, %zmm3
9105 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9106 ; AVX512-NEXT: vpermt2q %zmm13, %zmm17, %zmm2
9107 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9108 ; AVX512-NEXT: vpermt2q %zmm28, %zmm17, %zmm9
9109 ; AVX512-NEXT: vpermt2q %zmm4, %zmm17, %zmm5
9110 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9111 ; AVX512-NEXT: vpermt2q %zmm18, %zmm17, %zmm15
9112 ; AVX512-NEXT: vpermt2q %zmm19, %zmm17, %zmm1
9113 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9114 ; AVX512-NEXT: vpermt2q %zmm24, %zmm17, %zmm6
9115 ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9116 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
9117 ; AVX512-NEXT: vpermt2q %zmm26, %zmm17, %zmm3
9118 ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm14[4,5,6,7]
9119 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9120 ; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm14
9121 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm12[4,5,6,7]
9122 ; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm30, %zmm4
9123 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm10[4,5,6,7]
9124 ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm10
9125 ; AVX512-NEXT: vmovdqa64 %zmm31, %zmm25 {%k1}
9126 ; AVX512-NEXT: vmovdqa 64(%rdi), %xmm0
9127 ; AVX512-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0
9128 ; AVX512-NEXT: vmovdqa (%rdi), %xmm1
9129 ; AVX512-NEXT: vinserti128 $1, 128(%rdi), %ymm1, %ymm1
9130 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
9131 ; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm25, %zmm2
9132 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
9133 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
9134 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1}
9135 ; AVX512-NEXT: vmovdqa 576(%rdi), %xmm5
9136 ; AVX512-NEXT: vinserti128 $1, 704(%rdi), %ymm5, %ymm5
9137 ; AVX512-NEXT: vmovdqa 512(%rdi), %xmm6
9138 ; AVX512-NEXT: vinserti128 $1, 640(%rdi), %ymm6, %ymm6
9139 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm6[0],ymm5[0],ymm6[2],ymm5[2]
9140 ; AVX512-NEXT: vinserti64x4 $0, %ymm18, %zmm7, %zmm27
9141 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
9142 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
9143 ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1}
9144 ; AVX512-NEXT: vmovdqa 1088(%rdi), %xmm13
9145 ; AVX512-NEXT: vinserti128 $1, 1216(%rdi), %ymm13, %ymm13
9146 ; AVX512-NEXT: vmovdqa64 1024(%rdi), %xmm18
9147 ; AVX512-NEXT: vinserti32x4 $1, 1152(%rdi), %ymm18, %ymm18
9148 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm19 = ymm18[0],ymm13[0],ymm18[2],ymm13[2]
9149 ; AVX512-NEXT: vinserti64x4 $0, %ymm19, %zmm7, %zmm19
9150 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
9151 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
9152 ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1}
9153 ; AVX512-NEXT: vmovdqa64 1600(%rdi), %xmm21
9154 ; AVX512-NEXT: vinserti32x4 $1, 1728(%rdi), %ymm21, %ymm21
9155 ; AVX512-NEXT: vmovdqa64 1536(%rdi), %xmm25
9156 ; AVX512-NEXT: vinserti32x4 $1, 1664(%rdi), %ymm25, %ymm25
9157 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm26 = ymm25[0],ymm21[0],ymm25[2],ymm21[2]
9158 ; AVX512-NEXT: vinserti64x4 $0, %ymm26, %zmm7, %zmm12
9159 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
9160 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
9161 ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1}
9162 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3]
9163 ; AVX512-NEXT: vinserti64x4 $0, %ymm13, %zmm7, %zmm13
9164 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
9165 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
9166 ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1}
9167 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3]
9168 ; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5
9169 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
9170 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
9171 ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1}
9172 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
9173 ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0
9174 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
9175 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
9176 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1}
9177 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm25[1],ymm21[1],ymm25[3],ymm21[3]
9178 ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm1
9179 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
9180 ; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
9181 ; AVX512-NEXT: # ymm6 = ymm6[0,1,2,3],mem[4,5,6,7]
9182 ; AVX512-NEXT: vinserti64x4 $0, %ymm6, %zmm23, %zmm6
9183 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
9184 ; AVX512-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
9185 ; AVX512-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7]
9186 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
9187 ; AVX512-NEXT: vinsertf64x4 $0, %ymm7, %zmm8, %zmm7
9188 ; AVX512-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm8 # 32-byte Folded Reload
9189 ; AVX512-NEXT: # ymm8 = mem[0,1,2,3],ymm9[4,5,6,7]
9190 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
9191 ; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8
9192 ; AVX512-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm9 # 32-byte Folded Reload
9193 ; AVX512-NEXT: # ymm9 = mem[0,1,2,3],ymm15[4,5,6,7]
9194 ; AVX512-NEXT: vinserti64x4 $0, %ymm9, %zmm22, %zmm9
9195 ; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm11 # 32-byte Folded Reload
9196 ; AVX512-NEXT: # ymm11 = ymm3[0,1,2,3],mem[4,5,6,7]
9197 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
9198 ; AVX512-NEXT: vinserti64x4 $0, %ymm11, %zmm3, %zmm11
9199 ; AVX512-NEXT: vmovdqa64 %zmm12, 192(%rsi)
9200 ; AVX512-NEXT: vmovdqa64 %zmm19, 128(%rsi)
9201 ; AVX512-NEXT: vmovdqa64 %zmm27, 64(%rsi)
9202 ; AVX512-NEXT: vmovdqa64 %zmm2, (%rsi)
9203 ; AVX512-NEXT: vmovdqa64 %zmm1, 192(%rdx)
9204 ; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx)
9205 ; AVX512-NEXT: vmovdqa64 %zmm5, 64(%rdx)
9206 ; AVX512-NEXT: vmovdqa64 %zmm13, 128(%rdx)
9207 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9208 ; AVX512-NEXT: vmovaps %zmm0, 192(%rcx)
9209 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9210 ; AVX512-NEXT: vmovaps %zmm0, (%rcx)
9211 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9212 ; AVX512-NEXT: vmovaps %zmm0, 64(%rcx)
9213 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9214 ; AVX512-NEXT: vmovaps %zmm0, 128(%rcx)
9215 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9216 ; AVX512-NEXT: vmovaps %zmm0, 192(%r8)
9217 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9218 ; AVX512-NEXT: vmovaps %zmm0, (%r8)
9219 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9220 ; AVX512-NEXT: vmovaps %zmm0, 64(%r8)
9221 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9222 ; AVX512-NEXT: vmovaps %zmm0, 128(%r8)
9223 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9224 ; AVX512-NEXT: vmovaps %zmm0, 192(%r9)
9225 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9226 ; AVX512-NEXT: vmovaps %zmm0, (%r9)
9227 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9228 ; AVX512-NEXT: vmovaps %zmm0, 64(%r9)
9229 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9230 ; AVX512-NEXT: vmovaps %zmm0, 128(%r9)
9231 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
9232 ; AVX512-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
9233 ; AVX512-NEXT: vmovaps %zmm0, 192(%rax)
9234 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9235 ; AVX512-NEXT: vmovaps %zmm0, (%rax)
9236 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9237 ; AVX512-NEXT: vmovaps %zmm0, 64(%rax)
9238 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9239 ; AVX512-NEXT: vmovaps %zmm0, 128(%rax)
9240 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
9241 ; AVX512-NEXT: vmovdqa64 %zmm6, 192(%rax)
9242 ; AVX512-NEXT: vmovdqa64 %zmm10, (%rax)
9243 ; AVX512-NEXT: vmovdqa64 %zmm4, 64(%rax)
9244 ; AVX512-NEXT: vmovdqa64 %zmm14, 128(%rax)
9245 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
9246 ; AVX512-NEXT: vmovdqa64 %zmm11, 128(%rax)
9247 ; AVX512-NEXT: vmovdqa64 %zmm9, 192(%rax)
9248 ; AVX512-NEXT: vmovdqa64 %zmm8, (%rax)
9249 ; AVX512-NEXT: vmovaps %zmm7, 64(%rax)
9250 ; AVX512-NEXT: addq $3208, %rsp # imm = 0xC88
9251 ; AVX512-NEXT: vzeroupper
9254 ; AVX512-FCP-LABEL: load_i64_stride8_vf32:
9255 ; AVX512-FCP: # %bb.0:
9256 ; AVX512-FCP-NEXT: subq $3208, %rsp # imm = 0xC88
9257 ; AVX512-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm12
9258 ; AVX512-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm16
9259 ; AVX512-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm19
9260 ; AVX512-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm18
9261 ; AVX512-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm24
9262 ; AVX512-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm27
9263 ; AVX512-FCP-NEXT: vmovdqa64 832(%rdi), %zmm7
9264 ; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9265 ; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm11
9266 ; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9267 ; AVX512-FCP-NEXT: vmovdqa64 960(%rdi), %zmm31
9268 ; AVX512-FCP-NEXT: vmovdqa64 896(%rdi), %zmm3
9269 ; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm20
9270 ; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm8
9271 ; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9272 ; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm25
9273 ; AVX512-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9274 ; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm23
9275 ; AVX512-FCP-NEXT: movb $-64, %al
9276 ; AVX512-FCP-NEXT: kmovw %eax, %k1
9277 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10]
9278 ; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
9279 ; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm1
9280 ; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm1
9281 ; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm2
9282 ; AVX512-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9283 ; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm2
9284 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
9285 ; AVX512-FCP-NEXT: vmovdqa64 1216(%rdi), %ymm22
9286 ; AVX512-FCP-NEXT: vmovdqa 1152(%rdi), %ymm10
9287 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm10[0],ymm22[0],ymm10[2],ymm22[2]
9288 ; AVX512-FCP-NEXT: vmovdqa 1088(%rdi), %ymm9
9289 ; AVX512-FCP-NEXT: vmovdqa 1024(%rdi), %ymm6
9290 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm6[0],ymm9[0],ymm6[2],ymm9[2]
9291 ; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm13[2,3],ymm4[2,3]
9292 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm1
9293 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9294 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm2
9295 ; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm2
9296 ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm4
9297 ; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm4
9298 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1}
9299 ; AVX512-FCP-NEXT: vmovdqa 704(%rdi), %ymm7
9300 ; AVX512-FCP-NEXT: vmovdqa 640(%rdi), %ymm13
9301 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm13[0],ymm7[0],ymm13[2],ymm7[2]
9302 ; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %ymm17
9303 ; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %ymm21
9304 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm21[0],ymm17[0],ymm21[2],ymm17[2]
9305 ; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm14[2,3]
9306 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm4, %zmm1
9307 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9308 ; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm4
9309 ; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm4
9310 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm11
9311 ; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm11
9312 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm11 {%k1}
9313 ; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm4
9314 ; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm14
9315 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm14[0],ymm4[0],ymm14[2],ymm4[2]
9316 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %ymm25
9317 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %ymm28
9318 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm28[0],ymm25[0],ymm28[2],ymm25[2]
9319 ; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3]
9320 ; AVX512-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm29
9321 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1
9322 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9323 ; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm1
9324 ; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm1
9325 ; AVX512-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm26
9326 ; AVX512-FCP-NEXT: vpermi2q %zmm12, %zmm26, %zmm0
9327 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
9328 ; AVX512-FCP-NEXT: vmovdqa 1728(%rdi), %ymm2
9329 ; AVX512-FCP-NEXT: vmovdqa 1664(%rdi), %ymm8
9330 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm8[0],ymm2[0],ymm8[2],ymm2[2]
9331 ; AVX512-FCP-NEXT: vmovdqa64 1600(%rdi), %ymm30
9332 ; AVX512-FCP-NEXT: vmovdqa 1536(%rdi), %ymm1
9333 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm1[0],ymm30[0],ymm1[2],ymm30[2]
9334 ; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm15[2,3],ymm11[2,3]
9335 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm0
9336 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9337 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11]
9338 ; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
9339 ; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm11
9340 ; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm11
9341 ; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm18
9342 ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm18 {%k1}
9343 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm10[1],ymm22[1],ymm10[3],ymm22[3]
9344 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm9[1],ymm6[3],ymm9[3]
9345 ; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm6[2,3],ymm5[2,3]
9346 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm18, %zmm5
9347 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9348 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm5
9349 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm10
9350 ; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm5
9351 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
9352 ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm6
9353 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
9354 ; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm6
9355 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1}
9356 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm13[1],ymm7[1],ymm13[3],ymm7[3]
9357 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm21[1],ymm17[1],ymm21[3],ymm17[3]
9358 ; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm5[2,3],ymm3[2,3]
9359 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3
9360 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9361 ; AVX512-FCP-NEXT: vmovdqu64 %zmm23, (%rsp) # 64-byte Spill
9362 ; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm3
9363 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
9364 ; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm3
9365 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
9366 ; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm5
9367 ; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm5
9368 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1}
9369 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3]
9370 ; AVX512-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm7
9371 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm28[1],ymm25[1],ymm28[3],ymm25[3]
9372 ; AVX512-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm6
9373 ; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3]
9374 ; AVX512-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm13
9375 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3
9376 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9377 ; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm15
9378 ; AVX512-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9379 ; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm3
9380 ; AVX512-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9381 ; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm3
9382 ; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm9
9383 ; AVX512-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9384 ; AVX512-FCP-NEXT: vpermi2q %zmm12, %zmm26, %zmm0
9385 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1}
9386 ; AVX512-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm4
9387 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm2[1],ymm8[3],ymm2[3]
9388 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],ymm30[1],ymm1[3],ymm30[3]
9389 ; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3]
9390 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
9391 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9392 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12]
9393 ; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
9394 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm1
9395 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9396 ; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm1
9397 ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm8
9398 ; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9399 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm2
9400 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm5
9401 ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9402 ; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm2
9403 ; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9404 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
9405 ; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm2
9406 ; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm2
9407 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
9408 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm17[0],zmm19[0],zmm17[2],zmm19[2],zmm17[4],zmm19[4],zmm17[6],zmm19[6]
9409 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
9410 ; AVX512-FCP-NEXT: vmovdqa64 704(%rdi), %zmm25
9411 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
9412 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9413 ; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %zmm1
9414 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9415 ; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm1
9416 ; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm26
9417 ; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm14
9418 ; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm2
9419 ; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9420 ; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm2
9421 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
9422 ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm2
9423 ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm29
9424 ; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm2
9425 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm11[0],zmm22[0],zmm11[2],zmm22[2],zmm11[4],zmm22[4],zmm11[6],zmm22[6]
9426 ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm10
9427 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
9428 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
9429 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9430 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm2
9431 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9432 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1
9433 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9434 ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
9435 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm6
9436 ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9437 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm2
9438 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9439 ; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm2
9440 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
9441 ; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm23
9442 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm20[0],zmm21[2],zmm20[2],zmm21[4],zmm20[4],zmm21[6],zmm20[6]
9443 ; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm22
9444 ; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm3 {%k1}
9445 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
9446 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9447 ; AVX512-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm18
9448 ; AVX512-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm1
9449 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9450 ; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm1
9451 ; AVX512-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm13
9452 ; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9453 ; AVX512-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm6
9454 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm2
9455 ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9456 ; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm2
9457 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
9458 ; AVX512-FCP-NEXT: vpermi2q %zmm16, %zmm15, %zmm0
9459 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm9[0],zmm12[0],zmm9[2],zmm12[2],zmm9[4],zmm12[4],zmm9[6],zmm12[6]
9460 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
9461 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0
9462 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9463 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [5,13,5,13,5,13,5,13]
9464 ; AVX512-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
9465 ; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm4
9466 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm1
9467 ; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm2, %zmm1
9468 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm4[4,5,6,7]
9469 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [0,8,0,8,0,8,0,8]
9470 ; AVX512-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
9471 ; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm9
9472 ; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm0
9473 ; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm21, %zmm0
9474 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9475 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [1,9,1,9,1,9,1,9]
9476 ; AVX512-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
9477 ; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm0
9478 ; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm16, %zmm0
9479 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9480 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [6,14,6,14,6,14,6,14]
9481 ; AVX512-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
9482 ; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm0
9483 ; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm23, %zmm0
9484 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm17[1],zmm19[1],zmm17[3],zmm19[3],zmm17[5],zmm19[5],zmm17[7],zmm19[7]
9485 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,7,15,7,15,7,15]
9486 ; AVX512-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
9487 ; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm17, %zmm9
9488 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm27[0],zmm24[0],zmm27[2],zmm24[2],zmm27[4],zmm24[4],zmm27[6],zmm24[6]
9489 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9490 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm27[1],zmm24[1],zmm27[3],zmm24[3],zmm27[5],zmm24[5],zmm27[7],zmm24[7]
9491 ; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9492 ; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm0
9493 ; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm8
9494 ; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm21, %zmm27
9495 ; AVX512-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9496 ; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm16, %zmm0
9497 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9498 ; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm2, %zmm8
9499 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1}
9500 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm0
9501 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9502 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
9503 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm7
9504 ; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm9
9505 ; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm2, %zmm7
9506 ; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm13
9507 ; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm2, %zmm14
9508 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm7[4,5,6,7]
9509 ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm0
9510 ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm4
9511 ; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm21, %zmm4
9512 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9513 ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm4
9514 ; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm16, %zmm4
9515 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9516 ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm30
9517 ; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm23, %zmm30
9518 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm11[1],zmm10[1],zmm11[3],zmm10[3],zmm11[5],zmm10[5],zmm11[7],zmm10[7]
9519 ; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm17, %zmm0
9520 ; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm4
9521 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k1} = zmm29[0],zmm31[0],zmm29[2],zmm31[2],zmm29[4],zmm31[4],zmm29[6],zmm31[6]
9522 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm29[1],zmm31[1],zmm29[3],zmm31[3],zmm29[5],zmm31[5],zmm29[7],zmm31[7]
9523 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9524 ; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm0
9525 ; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm14
9526 ; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm21, %zmm4
9527 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9528 ; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm16, %zmm0
9529 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9530 ; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm2, %zmm14
9531 ; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm11 {%k1}
9532 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm0
9533 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9534 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
9535 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
9536 ; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm2, %zmm8
9537 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
9538 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm11
9539 ; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm11 # 64-byte Folded Reload
9540 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm8[4,5,6,7]
9541 ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm0
9542 ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm25
9543 ; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm21, %zmm25
9544 ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm5
9545 ; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm16, %zmm5
9546 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9547 ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm27
9548 ; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm23, %zmm27
9549 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm22[1],zmm20[1],zmm22[3],zmm20[3],zmm22[5],zmm20[5],zmm22[7],zmm20[7]
9550 ; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm17, %zmm0
9551 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
9552 ; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm31 # 64-byte Reload
9553 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k1} = zmm31[0],zmm1[0],zmm31[2],zmm1[2],zmm31[4],zmm1[4],zmm31[6],zmm1[6]
9554 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm31[1],zmm1[1],zmm31[3],zmm1[3],zmm31[5],zmm1[5],zmm31[7],zmm1[7]
9555 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9556 ; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm0
9557 ; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm24
9558 ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm31
9559 ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm16, %zmm0
9560 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9561 ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm24
9562 ; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm14 {%k1}
9563 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm14, %zmm0
9564 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9565 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
9566 ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm11
9567 ; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm2, %zmm11
9568 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
9569 ; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm2, %zmm6
9570 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0,1,2,3],ymm11[4,5,6,7]
9571 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
9572 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
9573 ; AVX512-FCP-NEXT: vpermi2q %zmm20, %zmm5, %zmm2
9574 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
9575 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm22[1],zmm12[1],zmm22[3],zmm12[3],zmm22[5],zmm12[5],zmm22[7],zmm12[7]
9576 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm14 {%k1}
9577 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm14, %zmm2
9578 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill
9579 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm2
9580 ; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm21, %zmm2
9581 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9582 ; AVX512-FCP-NEXT: vpermi2q %zmm12, %zmm22, %zmm21
9583 ; AVX512-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9584 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm2
9585 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm21
9586 ; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm16, %zmm2
9587 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9588 ; AVX512-FCP-NEXT: vpermi2q %zmm12, %zmm22, %zmm16
9589 ; AVX512-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9590 ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm16
9591 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
9592 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm14
9593 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
9594 ; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm23, %zmm14
9595 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
9596 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
9597 ; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm23, %zmm11
9598 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm12
9599 ; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm23, %zmm12
9600 ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm29
9601 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
9602 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm8
9603 ; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm23, %zmm8
9604 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
9605 ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm10
9606 ; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm23, %zmm10
9607 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm5
9608 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm7
9609 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
9610 ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm23, %zmm7
9611 ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm0
9612 ; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm23, %zmm0
9613 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9614 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
9615 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0
9616 ; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm23, %zmm0
9617 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9618 ; AVX512-FCP-NEXT: vpermi2q %zmm16, %zmm22, %zmm23
9619 ; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm17, %zmm22
9620 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k1} = zmm21[0],zmm20[0],zmm21[2],zmm20[2],zmm21[4],zmm20[4],zmm21[6],zmm20[6]
9621 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm22 {%k1} = zmm21[1],zmm20[1],zmm21[3],zmm20[3],zmm21[5],zmm20[5],zmm21[7],zmm20[7]
9622 ; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm17, %zmm3
9623 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9624 ; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm17, %zmm2
9625 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9626 ; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm17, %zmm9
9627 ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm17, %zmm5
9628 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9629 ; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm17, %zmm15
9630 ; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm17, %zmm1
9631 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9632 ; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm17, %zmm6
9633 ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9634 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
9635 ; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm17, %zmm3
9636 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm14[4,5,6,7]
9637 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9638 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm14
9639 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm12[4,5,6,7]
9640 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm30, %zmm4
9641 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm10[4,5,6,7]
9642 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm10
9643 ; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm25 {%k1}
9644 ; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm0
9645 ; AVX512-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0
9646 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm1
9647 ; AVX512-FCP-NEXT: vinserti128 $1, 128(%rdi), %ymm1, %ymm1
9648 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
9649 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm25, %zmm2
9650 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
9651 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
9652 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1}
9653 ; AVX512-FCP-NEXT: vmovdqa 576(%rdi), %xmm5
9654 ; AVX512-FCP-NEXT: vinserti128 $1, 704(%rdi), %ymm5, %ymm5
9655 ; AVX512-FCP-NEXT: vmovdqa 512(%rdi), %xmm6
9656 ; AVX512-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm6, %ymm6
9657 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm6[0],ymm5[0],ymm6[2],ymm5[2]
9658 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm18, %zmm7, %zmm27
9659 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
9660 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
9661 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1}
9662 ; AVX512-FCP-NEXT: vmovdqa 1088(%rdi), %xmm13
9663 ; AVX512-FCP-NEXT: vinserti128 $1, 1216(%rdi), %ymm13, %ymm13
9664 ; AVX512-FCP-NEXT: vmovdqa64 1024(%rdi), %xmm18
9665 ; AVX512-FCP-NEXT: vinserti32x4 $1, 1152(%rdi), %ymm18, %ymm18
9666 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm19 = ymm18[0],ymm13[0],ymm18[2],ymm13[2]
9667 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm19, %zmm7, %zmm19
9668 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
9669 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
9670 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1}
9671 ; AVX512-FCP-NEXT: vmovdqa64 1600(%rdi), %xmm21
9672 ; AVX512-FCP-NEXT: vinserti32x4 $1, 1728(%rdi), %ymm21, %ymm21
9673 ; AVX512-FCP-NEXT: vmovdqa64 1536(%rdi), %xmm25
9674 ; AVX512-FCP-NEXT: vinserti32x4 $1, 1664(%rdi), %ymm25, %ymm25
9675 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm26 = ymm25[0],ymm21[0],ymm25[2],ymm21[2]
9676 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm26, %zmm7, %zmm12
9677 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
9678 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
9679 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1}
9680 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3]
9681 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm7, %zmm13
9682 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
9683 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
9684 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1}
9685 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3]
9686 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5
9687 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
9688 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
9689 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1}
9690 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
9691 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0
9692 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
9693 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
9694 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1}
9695 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm25[1],ymm21[1],ymm25[3],ymm21[3]
9696 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm1
9697 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
9698 ; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
9699 ; AVX512-FCP-NEXT: # ymm6 = ymm6[0,1,2,3],mem[4,5,6,7]
9700 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm23, %zmm6
9701 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
9702 ; AVX512-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
9703 ; AVX512-FCP-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7]
9704 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
9705 ; AVX512-FCP-NEXT: vinsertf64x4 $0, %ymm7, %zmm8, %zmm7
9706 ; AVX512-FCP-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm8 # 32-byte Folded Reload
9707 ; AVX512-FCP-NEXT: # ymm8 = mem[0,1,2,3],ymm9[4,5,6,7]
9708 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
9709 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8
9710 ; AVX512-FCP-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm9 # 32-byte Folded Reload
9711 ; AVX512-FCP-NEXT: # ymm9 = mem[0,1,2,3],ymm15[4,5,6,7]
9712 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm22, %zmm9
9713 ; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm11 # 32-byte Folded Reload
9714 ; AVX512-FCP-NEXT: # ymm11 = ymm3[0,1,2,3],mem[4,5,6,7]
9715 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
9716 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm3, %zmm11
9717 ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 192(%rsi)
9718 ; AVX512-FCP-NEXT: vmovdqa64 %zmm19, 128(%rsi)
9719 ; AVX512-FCP-NEXT: vmovdqa64 %zmm27, 64(%rsi)
9720 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, (%rsi)
9721 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 192(%rdx)
9722 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%rdx)
9723 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 64(%rdx)
9724 ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 128(%rdx)
9725 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9726 ; AVX512-FCP-NEXT: vmovaps %zmm0, 192(%rcx)
9727 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9728 ; AVX512-FCP-NEXT: vmovaps %zmm0, (%rcx)
9729 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9730 ; AVX512-FCP-NEXT: vmovaps %zmm0, 64(%rcx)
9731 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9732 ; AVX512-FCP-NEXT: vmovaps %zmm0, 128(%rcx)
9733 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9734 ; AVX512-FCP-NEXT: vmovaps %zmm0, 192(%r8)
9735 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9736 ; AVX512-FCP-NEXT: vmovaps %zmm0, (%r8)
9737 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9738 ; AVX512-FCP-NEXT: vmovaps %zmm0, 64(%r8)
9739 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9740 ; AVX512-FCP-NEXT: vmovaps %zmm0, 128(%r8)
9741 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9742 ; AVX512-FCP-NEXT: vmovaps %zmm0, 192(%r9)
9743 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9744 ; AVX512-FCP-NEXT: vmovaps %zmm0, (%r9)
9745 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9746 ; AVX512-FCP-NEXT: vmovaps %zmm0, 64(%r9)
9747 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9748 ; AVX512-FCP-NEXT: vmovaps %zmm0, 128(%r9)
9749 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
9750 ; AVX512-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
9751 ; AVX512-FCP-NEXT: vmovaps %zmm0, 192(%rax)
9752 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9753 ; AVX512-FCP-NEXT: vmovaps %zmm0, (%rax)
9754 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9755 ; AVX512-FCP-NEXT: vmovaps %zmm0, 64(%rax)
9756 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9757 ; AVX512-FCP-NEXT: vmovaps %zmm0, 128(%rax)
9758 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
9759 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 192(%rax)
9760 ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, (%rax)
9761 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 64(%rax)
9762 ; AVX512-FCP-NEXT: vmovdqa64 %zmm14, 128(%rax)
9763 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
9764 ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, 128(%rax)
9765 ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, 192(%rax)
9766 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, (%rax)
9767 ; AVX512-FCP-NEXT: vmovaps %zmm7, 64(%rax)
9768 ; AVX512-FCP-NEXT: addq $3208, %rsp # imm = 0xC88
9769 ; AVX512-FCP-NEXT: vzeroupper
9770 ; AVX512-FCP-NEXT: retq
9772 ; AVX512DQ-LABEL: load_i64_stride8_vf32:
9773 ; AVX512DQ: # %bb.0:
9774 ; AVX512DQ-NEXT: subq $3208, %rsp # imm = 0xC88
9775 ; AVX512DQ-NEXT: vmovdqa64 1856(%rdi), %zmm12
9776 ; AVX512DQ-NEXT: vmovdqa64 1984(%rdi), %zmm16
9777 ; AVX512DQ-NEXT: vmovdqa64 1344(%rdi), %zmm19
9778 ; AVX512DQ-NEXT: vmovdqa64 1280(%rdi), %zmm18
9779 ; AVX512DQ-NEXT: vmovdqa64 1472(%rdi), %zmm24
9780 ; AVX512DQ-NEXT: vmovdqa64 1408(%rdi), %zmm27
9781 ; AVX512DQ-NEXT: vmovdqa64 832(%rdi), %zmm7
9782 ; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9783 ; AVX512DQ-NEXT: vmovdqa64 768(%rdi), %zmm11
9784 ; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9785 ; AVX512DQ-NEXT: vmovdqa64 960(%rdi), %zmm31
9786 ; AVX512DQ-NEXT: vmovdqa64 896(%rdi), %zmm3
9787 ; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm20
9788 ; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm8
9789 ; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9790 ; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm25
9791 ; AVX512DQ-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9792 ; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm23
9793 ; AVX512DQ-NEXT: movb $-64, %al
9794 ; AVX512DQ-NEXT: kmovw %eax, %k1
9795 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10]
9796 ; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
9797 ; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm1
9798 ; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm0, %zmm1
9799 ; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm2
9800 ; AVX512DQ-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9801 ; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm0, %zmm2
9802 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
9803 ; AVX512DQ-NEXT: vmovdqa64 1216(%rdi), %ymm22
9804 ; AVX512DQ-NEXT: vmovdqa 1152(%rdi), %ymm10
9805 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm10[0],ymm22[0],ymm10[2],ymm22[2]
9806 ; AVX512DQ-NEXT: vmovdqa 1088(%rdi), %ymm9
9807 ; AVX512DQ-NEXT: vmovdqa 1024(%rdi), %ymm6
9808 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm6[0],ymm9[0],ymm6[2],ymm9[2]
9809 ; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm13[2,3],ymm4[2,3]
9810 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm1
9811 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9812 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm2
9813 ; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm0, %zmm2
9814 ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm4
9815 ; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm0, %zmm4
9816 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1}
9817 ; AVX512DQ-NEXT: vmovdqa 704(%rdi), %ymm7
9818 ; AVX512DQ-NEXT: vmovdqa 640(%rdi), %ymm13
9819 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm13[0],ymm7[0],ymm13[2],ymm7[2]
9820 ; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %ymm17
9821 ; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %ymm21
9822 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm21[0],ymm17[0],ymm21[2],ymm17[2]
9823 ; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm14[2,3]
9824 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm11, %zmm4, %zmm1
9825 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9826 ; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm4
9827 ; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm0, %zmm4
9828 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm11
9829 ; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm0, %zmm11
9830 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm11 {%k1}
9831 ; AVX512DQ-NEXT: vmovdqa 192(%rdi), %ymm4
9832 ; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm14
9833 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm14[0],ymm4[0],ymm14[2],ymm4[2]
9834 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %ymm25
9835 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %ymm28
9836 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm28[0],ymm25[0],ymm28[2],ymm25[2]
9837 ; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3]
9838 ; AVX512DQ-NEXT: vmovdqa64 1920(%rdi), %zmm29
9839 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1
9840 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9841 ; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm1
9842 ; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm0, %zmm1
9843 ; AVX512DQ-NEXT: vmovdqa64 1792(%rdi), %zmm26
9844 ; AVX512DQ-NEXT: vpermi2q %zmm12, %zmm26, %zmm0
9845 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
9846 ; AVX512DQ-NEXT: vmovdqa 1728(%rdi), %ymm2
9847 ; AVX512DQ-NEXT: vmovdqa 1664(%rdi), %ymm8
9848 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm8[0],ymm2[0],ymm8[2],ymm2[2]
9849 ; AVX512DQ-NEXT: vmovdqa64 1600(%rdi), %ymm30
9850 ; AVX512DQ-NEXT: vmovdqa 1536(%rdi), %ymm1
9851 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm1[0],ymm30[0],ymm1[2],ymm30[2]
9852 ; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm15[2,3],ymm11[2,3]
9853 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm0
9854 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9855 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11]
9856 ; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
9857 ; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm11
9858 ; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm0, %zmm11
9859 ; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm0, %zmm18
9860 ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm18 {%k1}
9861 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm10[1],ymm22[1],ymm10[3],ymm22[3]
9862 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm9[1],ymm6[3],ymm9[3]
9863 ; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm6[2,3],ymm5[2,3]
9864 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm18, %zmm5
9865 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9866 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm5
9867 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm10
9868 ; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm0, %zmm5
9869 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
9870 ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm6
9871 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
9872 ; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm0, %zmm6
9873 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1}
9874 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm13[1],ymm7[1],ymm13[3],ymm7[3]
9875 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm21[1],ymm17[1],ymm21[3],ymm17[3]
9876 ; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm5[2,3],ymm3[2,3]
9877 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3
9878 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9879 ; AVX512DQ-NEXT: vmovdqu64 %zmm23, (%rsp) # 64-byte Spill
9880 ; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm3
9881 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
9882 ; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm0, %zmm3
9883 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
9884 ; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm5
9885 ; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm0, %zmm5
9886 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1}
9887 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3]
9888 ; AVX512DQ-NEXT: vmovdqa64 1088(%rdi), %zmm7
9889 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm28[1],ymm25[1],ymm28[3],ymm25[3]
9890 ; AVX512DQ-NEXT: vmovdqa64 1024(%rdi), %zmm6
9891 ; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3]
9892 ; AVX512DQ-NEXT: vmovdqa64 1216(%rdi), %zmm13
9893 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3
9894 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9895 ; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm15
9896 ; AVX512DQ-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9897 ; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm3
9898 ; AVX512DQ-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9899 ; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm0, %zmm3
9900 ; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm9
9901 ; AVX512DQ-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9902 ; AVX512DQ-NEXT: vpermi2q %zmm12, %zmm26, %zmm0
9903 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1}
9904 ; AVX512DQ-NEXT: vmovdqa64 1152(%rdi), %zmm4
9905 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm2[1],ymm8[3],ymm2[3]
9906 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],ymm30[1],ymm1[3],ymm30[3]
9907 ; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3]
9908 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
9909 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9910 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12]
9911 ; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
9912 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm1
9913 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9914 ; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm0, %zmm1
9915 ; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm8
9916 ; AVX512DQ-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9917 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm2
9918 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm5
9919 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9920 ; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm0, %zmm2
9921 ; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9922 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
9923 ; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm2
9924 ; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm0, %zmm2
9925 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
9926 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm17[0],zmm19[0],zmm17[2],zmm19[2],zmm17[4],zmm19[4],zmm17[6],zmm19[6]
9927 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
9928 ; AVX512DQ-NEXT: vmovdqa64 704(%rdi), %zmm25
9929 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
9930 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9931 ; AVX512DQ-NEXT: vmovdqa64 640(%rdi), %zmm1
9932 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9933 ; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm0, %zmm1
9934 ; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %zmm26
9935 ; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %zmm14
9936 ; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm2
9937 ; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9938 ; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm0, %zmm2
9939 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
9940 ; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm2
9941 ; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm29
9942 ; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm0, %zmm2
9943 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm11[0],zmm22[0],zmm11[2],zmm22[2],zmm11[4],zmm22[4],zmm11[6],zmm22[6]
9944 ; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm10
9945 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
9946 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
9947 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9948 ; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm2
9949 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9950 ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm1
9951 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9952 ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
9953 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm6
9954 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9955 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm2
9956 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9957 ; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm0, %zmm2
9958 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
9959 ; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm0, %zmm23
9960 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm20[0],zmm21[2],zmm20[2],zmm21[4],zmm20[4],zmm21[6],zmm20[6]
9961 ; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm22
9962 ; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm3 {%k1}
9963 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
9964 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9965 ; AVX512DQ-NEXT: vmovdqa64 1728(%rdi), %zmm18
9966 ; AVX512DQ-NEXT: vmovdqa64 1664(%rdi), %zmm1
9967 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9968 ; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm0, %zmm1
9969 ; AVX512DQ-NEXT: vmovdqa64 1600(%rdi), %zmm13
9970 ; AVX512DQ-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9971 ; AVX512DQ-NEXT: vmovdqa64 1536(%rdi), %zmm6
9972 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm2
9973 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9974 ; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm0, %zmm2
9975 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
9976 ; AVX512DQ-NEXT: vpermi2q %zmm16, %zmm15, %zmm0
9977 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm9[0],zmm12[0],zmm9[2],zmm12[2],zmm9[4],zmm12[4],zmm9[6],zmm12[6]
9978 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
9979 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0
9980 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9981 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [5,13,5,13,5,13,5,13]
9982 ; AVX512DQ-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
9983 ; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm2, %zmm4
9984 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm1
9985 ; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm2, %zmm1
9986 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm4[4,5,6,7]
9987 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [0,8,0,8,0,8,0,8]
9988 ; AVX512DQ-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
9989 ; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm9
9990 ; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm0
9991 ; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm21, %zmm0
9992 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9993 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [1,9,1,9,1,9,1,9]
9994 ; AVX512DQ-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
9995 ; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm0
9996 ; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm16, %zmm0
9997 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9998 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [6,14,6,14,6,14,6,14]
9999 ; AVX512DQ-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
10000 ; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm0
10001 ; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm23, %zmm0
10002 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm17[1],zmm19[1],zmm17[3],zmm19[3],zmm17[5],zmm19[5],zmm17[7],zmm19[7]
10003 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,7,15,7,15,7,15]
10004 ; AVX512DQ-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
10005 ; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm17, %zmm9
10006 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm27[0],zmm24[0],zmm27[2],zmm24[2],zmm27[4],zmm24[4],zmm27[6],zmm24[6]
10007 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10008 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm27[1],zmm24[1],zmm27[3],zmm24[3],zmm27[5],zmm24[5],zmm27[7],zmm24[7]
10009 ; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10010 ; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm0
10011 ; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm8
10012 ; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm21, %zmm27
10013 ; AVX512DQ-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10014 ; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm16, %zmm0
10015 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10016 ; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm2, %zmm8
10017 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1}
10018 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm0
10019 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10020 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
10021 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm7
10022 ; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm9
10023 ; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm2, %zmm7
10024 ; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm13
10025 ; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm2, %zmm14
10026 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm7[4,5,6,7]
10027 ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm0
10028 ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm4
10029 ; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm21, %zmm4
10030 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10031 ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm4
10032 ; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm16, %zmm4
10033 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10034 ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm30
10035 ; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm23, %zmm30
10036 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm11[1],zmm10[1],zmm11[3],zmm10[3],zmm11[5],zmm10[5],zmm11[7],zmm10[7]
10037 ; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm17, %zmm0
10038 ; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm4
10039 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k1} = zmm29[0],zmm31[0],zmm29[2],zmm31[2],zmm29[4],zmm31[4],zmm29[6],zmm31[6]
10040 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm29[1],zmm31[1],zmm29[3],zmm31[3],zmm29[5],zmm31[5],zmm29[7],zmm31[7]
10041 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10042 ; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm0
10043 ; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm14
10044 ; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm21, %zmm4
10045 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10046 ; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm16, %zmm0
10047 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10048 ; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm2, %zmm14
10049 ; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm11 {%k1}
10050 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm0
10051 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10052 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
10053 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
10054 ; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm2, %zmm8
10055 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
10056 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm11
10057 ; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm11 # 64-byte Folded Reload
10058 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm8[4,5,6,7]
10059 ; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm0
10060 ; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm25
10061 ; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm21, %zmm25
10062 ; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm5
10063 ; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm16, %zmm5
10064 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10065 ; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm27
10066 ; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm23, %zmm27
10067 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm22[1],zmm20[1],zmm22[3],zmm20[3],zmm22[5],zmm20[5],zmm22[7],zmm20[7]
10068 ; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm17, %zmm0
10069 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
10070 ; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm31 # 64-byte Reload
10071 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k1} = zmm31[0],zmm1[0],zmm31[2],zmm1[2],zmm31[4],zmm1[4],zmm31[6],zmm1[6]
10072 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm31[1],zmm1[1],zmm31[3],zmm1[3],zmm31[5],zmm1[5],zmm31[7],zmm1[7]
10073 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10074 ; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm0
10075 ; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm24
10076 ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm21, %zmm31
10077 ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm16, %zmm0
10078 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10079 ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm2, %zmm24
10080 ; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm14 {%k1}
10081 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm11, %zmm14, %zmm0
10082 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10083 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
10084 ; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm11
10085 ; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm2, %zmm11
10086 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
10087 ; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm2, %zmm6
10088 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0,1,2,3],ymm11[4,5,6,7]
10089 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
10090 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
10091 ; AVX512DQ-NEXT: vpermi2q %zmm20, %zmm5, %zmm2
10092 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
10093 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm22[1],zmm12[1],zmm22[3],zmm12[3],zmm22[5],zmm12[5],zmm22[7],zmm12[7]
10094 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm14 {%k1}
10095 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm11, %zmm14, %zmm2
10096 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill
10097 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm2
10098 ; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm21, %zmm2
10099 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10100 ; AVX512DQ-NEXT: vpermi2q %zmm12, %zmm22, %zmm21
10101 ; AVX512DQ-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10102 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm2
10103 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm21
10104 ; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm16, %zmm2
10105 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10106 ; AVX512DQ-NEXT: vpermi2q %zmm12, %zmm22, %zmm16
10107 ; AVX512DQ-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10108 ; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm16
10109 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
10110 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm14
10111 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
10112 ; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm23, %zmm14
10113 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
10114 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
10115 ; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm23, %zmm11
10116 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm12
10117 ; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm23, %zmm12
10118 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm29
10119 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
10120 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm8
10121 ; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm23, %zmm8
10122 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
10123 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm10
10124 ; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm23, %zmm10
10125 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm5
10126 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm7
10127 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
10128 ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm23, %zmm7
10129 ; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm0
10130 ; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm23, %zmm0
10131 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10132 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
10133 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm0
10134 ; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm23, %zmm0
10135 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10136 ; AVX512DQ-NEXT: vpermi2q %zmm16, %zmm22, %zmm23
10137 ; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm17, %zmm22
10138 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k1} = zmm21[0],zmm20[0],zmm21[2],zmm20[2],zmm21[4],zmm20[4],zmm21[6],zmm20[6]
10139 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm22 {%k1} = zmm21[1],zmm20[1],zmm21[3],zmm20[3],zmm21[5],zmm20[5],zmm21[7],zmm20[7]
10140 ; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm17, %zmm3
10141 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10142 ; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm17, %zmm2
10143 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10144 ; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm17, %zmm9
10145 ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm17, %zmm5
10146 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10147 ; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm17, %zmm15
10148 ; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm17, %zmm1
10149 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10150 ; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm17, %zmm6
10151 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10152 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
10153 ; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm17, %zmm3
10154 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm14[4,5,6,7]
10155 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10156 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm14
10157 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm12[4,5,6,7]
10158 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm30, %zmm4
10159 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm10[4,5,6,7]
10160 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm10
10161 ; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm25 {%k1}
10162 ; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm0
10163 ; AVX512DQ-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0
10164 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1
10165 ; AVX512DQ-NEXT: vinserti128 $1, 128(%rdi), %ymm1, %ymm1
10166 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
10167 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm25, %zmm2
10168 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
10169 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
10170 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1}
10171 ; AVX512DQ-NEXT: vmovdqa 576(%rdi), %xmm5
10172 ; AVX512DQ-NEXT: vinserti128 $1, 704(%rdi), %ymm5, %ymm5
10173 ; AVX512DQ-NEXT: vmovdqa 512(%rdi), %xmm6
10174 ; AVX512DQ-NEXT: vinserti128 $1, 640(%rdi), %ymm6, %ymm6
10175 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm6[0],ymm5[0],ymm6[2],ymm5[2]
10176 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm18, %zmm7, %zmm27
10177 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
10178 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
10179 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1}
10180 ; AVX512DQ-NEXT: vmovdqa 1088(%rdi), %xmm13
10181 ; AVX512DQ-NEXT: vinserti128 $1, 1216(%rdi), %ymm13, %ymm13
10182 ; AVX512DQ-NEXT: vmovdqa64 1024(%rdi), %xmm18
10183 ; AVX512DQ-NEXT: vinserti32x4 $1, 1152(%rdi), %ymm18, %ymm18
10184 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm19 = ymm18[0],ymm13[0],ymm18[2],ymm13[2]
10185 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm19, %zmm7, %zmm19
10186 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
10187 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
10188 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1}
10189 ; AVX512DQ-NEXT: vmovdqa64 1600(%rdi), %xmm21
10190 ; AVX512DQ-NEXT: vinserti32x4 $1, 1728(%rdi), %ymm21, %ymm21
10191 ; AVX512DQ-NEXT: vmovdqa64 1536(%rdi), %xmm25
10192 ; AVX512DQ-NEXT: vinserti32x4 $1, 1664(%rdi), %ymm25, %ymm25
10193 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm26 = ymm25[0],ymm21[0],ymm25[2],ymm21[2]
10194 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm26, %zmm7, %zmm12
10195 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
10196 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
10197 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1}
10198 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3]
10199 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm13, %zmm7, %zmm13
10200 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
10201 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
10202 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1}
10203 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3]
10204 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5
10205 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
10206 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
10207 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1}
10208 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
10209 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0
10210 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
10211 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
10212 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1}
10213 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm25[1],ymm21[1],ymm25[3],ymm21[3]
10214 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm1
10215 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
10216 ; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
10217 ; AVX512DQ-NEXT: # ymm6 = ymm6[0,1,2,3],mem[4,5,6,7]
10218 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm6, %zmm23, %zmm6
10219 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
10220 ; AVX512DQ-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
10221 ; AVX512DQ-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7]
10222 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
10223 ; AVX512DQ-NEXT: vinsertf64x4 $0, %ymm7, %zmm8, %zmm7
10224 ; AVX512DQ-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm8 # 32-byte Folded Reload
10225 ; AVX512DQ-NEXT: # ymm8 = mem[0,1,2,3],ymm9[4,5,6,7]
10226 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
10227 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8
10228 ; AVX512DQ-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm9 # 32-byte Folded Reload
10229 ; AVX512DQ-NEXT: # ymm9 = mem[0,1,2,3],ymm15[4,5,6,7]
10230 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm9, %zmm22, %zmm9
10231 ; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm11 # 32-byte Folded Reload
10232 ; AVX512DQ-NEXT: # ymm11 = ymm3[0,1,2,3],mem[4,5,6,7]
10233 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
10234 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm11, %zmm3, %zmm11
10235 ; AVX512DQ-NEXT: vmovdqa64 %zmm12, 192(%rsi)
10236 ; AVX512DQ-NEXT: vmovdqa64 %zmm19, 128(%rsi)
10237 ; AVX512DQ-NEXT: vmovdqa64 %zmm27, 64(%rsi)
10238 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, (%rsi)
10239 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, 192(%rdx)
10240 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx)
10241 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, 64(%rdx)
10242 ; AVX512DQ-NEXT: vmovdqa64 %zmm13, 128(%rdx)
10243 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10244 ; AVX512DQ-NEXT: vmovaps %zmm0, 192(%rcx)
10245 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10246 ; AVX512DQ-NEXT: vmovaps %zmm0, (%rcx)
10247 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10248 ; AVX512DQ-NEXT: vmovaps %zmm0, 64(%rcx)
10249 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10250 ; AVX512DQ-NEXT: vmovaps %zmm0, 128(%rcx)
10251 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10252 ; AVX512DQ-NEXT: vmovaps %zmm0, 192(%r8)
10253 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10254 ; AVX512DQ-NEXT: vmovaps %zmm0, (%r8)
10255 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10256 ; AVX512DQ-NEXT: vmovaps %zmm0, 64(%r8)
10257 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10258 ; AVX512DQ-NEXT: vmovaps %zmm0, 128(%r8)
10259 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10260 ; AVX512DQ-NEXT: vmovaps %zmm0, 192(%r9)
10261 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10262 ; AVX512DQ-NEXT: vmovaps %zmm0, (%r9)
10263 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10264 ; AVX512DQ-NEXT: vmovaps %zmm0, 64(%r9)
10265 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10266 ; AVX512DQ-NEXT: vmovaps %zmm0, 128(%r9)
10267 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
10268 ; AVX512DQ-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
10269 ; AVX512DQ-NEXT: vmovaps %zmm0, 192(%rax)
10270 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10271 ; AVX512DQ-NEXT: vmovaps %zmm0, (%rax)
10272 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10273 ; AVX512DQ-NEXT: vmovaps %zmm0, 64(%rax)
10274 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10275 ; AVX512DQ-NEXT: vmovaps %zmm0, 128(%rax)
10276 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
10277 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, 192(%rax)
10278 ; AVX512DQ-NEXT: vmovdqa64 %zmm10, (%rax)
10279 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, 64(%rax)
10280 ; AVX512DQ-NEXT: vmovdqa64 %zmm14, 128(%rax)
10281 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
10282 ; AVX512DQ-NEXT: vmovdqa64 %zmm11, 128(%rax)
10283 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, 192(%rax)
10284 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, (%rax)
10285 ; AVX512DQ-NEXT: vmovaps %zmm7, 64(%rax)
10286 ; AVX512DQ-NEXT: addq $3208, %rsp # imm = 0xC88
10287 ; AVX512DQ-NEXT: vzeroupper
10288 ; AVX512DQ-NEXT: retq
10290 ; AVX512DQ-FCP-LABEL: load_i64_stride8_vf32:
10291 ; AVX512DQ-FCP: # %bb.0:
10292 ; AVX512DQ-FCP-NEXT: subq $3208, %rsp # imm = 0xC88
10293 ; AVX512DQ-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm12
10294 ; AVX512DQ-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm16
10295 ; AVX512DQ-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm19
10296 ; AVX512DQ-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm18
10297 ; AVX512DQ-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm24
10298 ; AVX512DQ-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm27
10299 ; AVX512DQ-FCP-NEXT: vmovdqa64 832(%rdi), %zmm7
10300 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10301 ; AVX512DQ-FCP-NEXT: vmovdqa64 768(%rdi), %zmm11
10302 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10303 ; AVX512DQ-FCP-NEXT: vmovdqa64 960(%rdi), %zmm31
10304 ; AVX512DQ-FCP-NEXT: vmovdqa64 896(%rdi), %zmm3
10305 ; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm20
10306 ; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm8
10307 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10308 ; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm25
10309 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10310 ; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm23
10311 ; AVX512DQ-FCP-NEXT: movb $-64, %al
10312 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1
10313 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10]
10314 ; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
10315 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm1
10316 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm1
10317 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm2
10318 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10319 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm2
10320 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
10321 ; AVX512DQ-FCP-NEXT: vmovdqa64 1216(%rdi), %ymm22
10322 ; AVX512DQ-FCP-NEXT: vmovdqa 1152(%rdi), %ymm10
10323 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm10[0],ymm22[0],ymm10[2],ymm22[2]
10324 ; AVX512DQ-FCP-NEXT: vmovdqa 1088(%rdi), %ymm9
10325 ; AVX512DQ-FCP-NEXT: vmovdqa 1024(%rdi), %ymm6
10326 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm6[0],ymm9[0],ymm6[2],ymm9[2]
10327 ; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm13[2,3],ymm4[2,3]
10328 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm1
10329 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10330 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm2
10331 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm2
10332 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm4
10333 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm4
10334 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1}
10335 ; AVX512DQ-FCP-NEXT: vmovdqa 704(%rdi), %ymm7
10336 ; AVX512DQ-FCP-NEXT: vmovdqa 640(%rdi), %ymm13
10337 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm13[0],ymm7[0],ymm13[2],ymm7[2]
10338 ; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %ymm17
10339 ; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %ymm21
10340 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm21[0],ymm17[0],ymm21[2],ymm17[2]
10341 ; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm14[2,3]
10342 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm4, %zmm1
10343 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10344 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm4
10345 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm4
10346 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm11
10347 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm11
10348 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm11 {%k1}
10349 ; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm4
10350 ; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm14
10351 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm14[0],ymm4[0],ymm14[2],ymm4[2]
10352 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %ymm25
10353 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %ymm28
10354 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm28[0],ymm25[0],ymm28[2],ymm25[2]
10355 ; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3]
10356 ; AVX512DQ-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm29
10357 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1
10358 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10359 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm1
10360 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm1
10361 ; AVX512DQ-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm26
10362 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm12, %zmm26, %zmm0
10363 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
10364 ; AVX512DQ-FCP-NEXT: vmovdqa 1728(%rdi), %ymm2
10365 ; AVX512DQ-FCP-NEXT: vmovdqa 1664(%rdi), %ymm8
10366 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm8[0],ymm2[0],ymm8[2],ymm2[2]
10367 ; AVX512DQ-FCP-NEXT: vmovdqa64 1600(%rdi), %ymm30
10368 ; AVX512DQ-FCP-NEXT: vmovdqa 1536(%rdi), %ymm1
10369 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm1[0],ymm30[0],ymm1[2],ymm30[2]
10370 ; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm15[2,3],ymm11[2,3]
10371 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm0
10372 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10373 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11]
10374 ; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
10375 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm11
10376 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm11
10377 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm18
10378 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm18 {%k1}
10379 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm10[1],ymm22[1],ymm10[3],ymm22[3]
10380 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm9[1],ymm6[3],ymm9[3]
10381 ; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm6[2,3],ymm5[2,3]
10382 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm18, %zmm5
10383 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10384 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm5
10385 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm10
10386 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm5
10387 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
10388 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm6
10389 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
10390 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm6
10391 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1}
10392 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm13[1],ymm7[1],ymm13[3],ymm7[3]
10393 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm21[1],ymm17[1],ymm21[3],ymm17[3]
10394 ; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm5[2,3],ymm3[2,3]
10395 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3
10396 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10397 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm23, (%rsp) # 64-byte Spill
10398 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm3
10399 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
10400 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm3
10401 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
10402 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm5
10403 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm5
10404 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1}
10405 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3]
10406 ; AVX512DQ-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm7
10407 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm28[1],ymm25[1],ymm28[3],ymm25[3]
10408 ; AVX512DQ-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm6
10409 ; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3]
10410 ; AVX512DQ-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm13
10411 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3
10412 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10413 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm15
10414 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10415 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm3
10416 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10417 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm3
10418 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm9
10419 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10420 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm12, %zmm26, %zmm0
10421 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1}
10422 ; AVX512DQ-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm4
10423 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm2[1],ymm8[3],ymm2[3]
10424 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],ymm30[1],ymm1[3],ymm30[3]
10425 ; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3]
10426 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
10427 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10428 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12]
10429 ; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
10430 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm1
10431 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10432 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm1
10433 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm8
10434 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10435 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm2
10436 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm5
10437 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10438 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm2
10439 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10440 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
10441 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm2
10442 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm2
10443 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
10444 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm17[0],zmm19[0],zmm17[2],zmm19[2],zmm17[4],zmm19[4],zmm17[6],zmm19[6]
10445 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
10446 ; AVX512DQ-FCP-NEXT: vmovdqa64 704(%rdi), %zmm25
10447 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
10448 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10449 ; AVX512DQ-FCP-NEXT: vmovdqa64 640(%rdi), %zmm1
10450 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10451 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm1
10452 ; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm26
10453 ; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm14
10454 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm2
10455 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10456 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm2
10457 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
10458 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm2
10459 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm29
10460 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm2
10461 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm11[0],zmm22[0],zmm11[2],zmm22[2],zmm11[4],zmm22[4],zmm11[6],zmm22[6]
10462 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm10
10463 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
10464 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
10465 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10466 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm2
10467 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10468 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1
10469 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10470 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
10471 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm6
10472 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10473 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm2
10474 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10475 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm2
10476 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
10477 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm23
10478 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm20[0],zmm21[2],zmm20[2],zmm21[4],zmm20[4],zmm21[6],zmm20[6]
10479 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm22
10480 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm3 {%k1}
10481 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
10482 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10483 ; AVX512DQ-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm18
10484 ; AVX512DQ-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm1
10485 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10486 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm1
10487 ; AVX512DQ-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm13
10488 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10489 ; AVX512DQ-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm6
10490 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm2
10491 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10492 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm2
10493 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
10494 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm16, %zmm15, %zmm0
10495 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm9[0],zmm12[0],zmm9[2],zmm12[2],zmm9[4],zmm12[4],zmm9[6],zmm12[6]
10496 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
10497 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0
10498 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10499 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [5,13,5,13,5,13,5,13]
10500 ; AVX512DQ-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
10501 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm4
10502 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm1
10503 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm2, %zmm1
10504 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm4[4,5,6,7]
10505 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [0,8,0,8,0,8,0,8]
10506 ; AVX512DQ-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
10507 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm9
10508 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm0
10509 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm21, %zmm0
10510 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10511 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [1,9,1,9,1,9,1,9]
10512 ; AVX512DQ-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
10513 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm0
10514 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm16, %zmm0
10515 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10516 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [6,14,6,14,6,14,6,14]
10517 ; AVX512DQ-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
10518 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm0
10519 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm23, %zmm0
10520 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm17[1],zmm19[1],zmm17[3],zmm19[3],zmm17[5],zmm19[5],zmm17[7],zmm19[7]
10521 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,7,15,7,15,7,15]
10522 ; AVX512DQ-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
10523 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm17, %zmm9
10524 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm27[0],zmm24[0],zmm27[2],zmm24[2],zmm27[4],zmm24[4],zmm27[6],zmm24[6]
10525 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10526 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm27[1],zmm24[1],zmm27[3],zmm24[3],zmm27[5],zmm24[5],zmm27[7],zmm24[7]
10527 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10528 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm0
10529 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm8
10530 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm21, %zmm27
10531 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10532 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm16, %zmm0
10533 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10534 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm2, %zmm8
10535 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1}
10536 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm0
10537 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10538 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
10539 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm7
10540 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm9
10541 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm2, %zmm7
10542 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm13
10543 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm2, %zmm14
10544 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm7[4,5,6,7]
10545 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm0
10546 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm4
10547 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm21, %zmm4
10548 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10549 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm4
10550 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm16, %zmm4
10551 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10552 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm30
10553 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm23, %zmm30
10554 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm11[1],zmm10[1],zmm11[3],zmm10[3],zmm11[5],zmm10[5],zmm11[7],zmm10[7]
10555 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm17, %zmm0
10556 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm4
10557 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k1} = zmm29[0],zmm31[0],zmm29[2],zmm31[2],zmm29[4],zmm31[4],zmm29[6],zmm31[6]
10558 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm29[1],zmm31[1],zmm29[3],zmm31[3],zmm29[5],zmm31[5],zmm29[7],zmm31[7]
10559 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10560 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm0
10561 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm14
10562 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm21, %zmm4
10563 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10564 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm16, %zmm0
10565 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10566 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm2, %zmm14
10567 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm11 {%k1}
10568 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm0
10569 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10570 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
10571 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
10572 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm2, %zmm8
10573 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
10574 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm11
10575 ; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm11 # 64-byte Folded Reload
10576 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm8[4,5,6,7]
10577 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm0
10578 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm25
10579 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm21, %zmm25
10580 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm5
10581 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm16, %zmm5
10582 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10583 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm27
10584 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm23, %zmm27
10585 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm22[1],zmm20[1],zmm22[3],zmm20[3],zmm22[5],zmm20[5],zmm22[7],zmm20[7]
10586 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm17, %zmm0
10587 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
10588 ; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm31 # 64-byte Reload
10589 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k1} = zmm31[0],zmm1[0],zmm31[2],zmm1[2],zmm31[4],zmm1[4],zmm31[6],zmm1[6]
10590 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm31[1],zmm1[1],zmm31[3],zmm1[3],zmm31[5],zmm1[5],zmm31[7],zmm1[7]
10591 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10592 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm0
10593 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm24
10594 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm31
10595 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm16, %zmm0
10596 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10597 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm24
10598 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm14 {%k1}
10599 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm14, %zmm0
10600 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10601 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
10602 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm11
10603 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm2, %zmm11
10604 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
10605 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm2, %zmm6
10606 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0,1,2,3],ymm11[4,5,6,7]
10607 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
10608 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
10609 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm20, %zmm5, %zmm2
10610 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
10611 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm22[1],zmm12[1],zmm22[3],zmm12[3],zmm22[5],zmm12[5],zmm22[7],zmm12[7]
10612 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm14 {%k1}
10613 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm14, %zmm2
10614 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill
10615 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm2
10616 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm21, %zmm2
10617 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10618 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm12, %zmm22, %zmm21
10619 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10620 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm2
10621 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm21
10622 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm16, %zmm2
10623 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10624 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm12, %zmm22, %zmm16
10625 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10626 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm16
10627 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
10628 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm14
10629 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
10630 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm23, %zmm14
10631 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
10632 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
10633 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm23, %zmm11
10634 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm12
10635 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm23, %zmm12
10636 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm29
10637 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
10638 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm8
10639 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm23, %zmm8
10640 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
10641 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm10
10642 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm23, %zmm10
10643 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm5
10644 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm7
10645 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
10646 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm23, %zmm7
10647 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm0
10648 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm23, %zmm0
10649 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10650 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
10651 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0
10652 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm23, %zmm0
10653 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10654 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm16, %zmm22, %zmm23
10655 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm17, %zmm22
10656 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k1} = zmm21[0],zmm20[0],zmm21[2],zmm20[2],zmm21[4],zmm20[4],zmm21[6],zmm20[6]
10657 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm22 {%k1} = zmm21[1],zmm20[1],zmm21[3],zmm20[3],zmm21[5],zmm20[5],zmm21[7],zmm20[7]
10658 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm17, %zmm3
10659 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10660 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm17, %zmm2
10661 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10662 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm17, %zmm9
10663 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm17, %zmm5
10664 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10665 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm17, %zmm15
10666 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm17, %zmm1
10667 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10668 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm17, %zmm6
10669 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10670 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
10671 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm17, %zmm3
10672 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm14[4,5,6,7]
10673 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10674 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm14
10675 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm12[4,5,6,7]
10676 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm30, %zmm4
10677 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm10[4,5,6,7]
10678 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm10
10679 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm25 {%k1}
10680 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm0
10681 ; AVX512DQ-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0
10682 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm1
10683 ; AVX512DQ-FCP-NEXT: vinserti128 $1, 128(%rdi), %ymm1, %ymm1
10684 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
10685 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm25, %zmm2
10686 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
10687 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
10688 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1}
10689 ; AVX512DQ-FCP-NEXT: vmovdqa 576(%rdi), %xmm5
10690 ; AVX512DQ-FCP-NEXT: vinserti128 $1, 704(%rdi), %ymm5, %ymm5
10691 ; AVX512DQ-FCP-NEXT: vmovdqa 512(%rdi), %xmm6
10692 ; AVX512DQ-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm6, %ymm6
10693 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm6[0],ymm5[0],ymm6[2],ymm5[2]
10694 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm18, %zmm7, %zmm27
10695 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
10696 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
10697 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1}
10698 ; AVX512DQ-FCP-NEXT: vmovdqa 1088(%rdi), %xmm13
10699 ; AVX512DQ-FCP-NEXT: vinserti128 $1, 1216(%rdi), %ymm13, %ymm13
10700 ; AVX512DQ-FCP-NEXT: vmovdqa64 1024(%rdi), %xmm18
10701 ; AVX512DQ-FCP-NEXT: vinserti32x4 $1, 1152(%rdi), %ymm18, %ymm18
10702 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm19 = ymm18[0],ymm13[0],ymm18[2],ymm13[2]
10703 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm19, %zmm7, %zmm19
10704 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
10705 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
10706 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1}
10707 ; AVX512DQ-FCP-NEXT: vmovdqa64 1600(%rdi), %xmm21
10708 ; AVX512DQ-FCP-NEXT: vinserti32x4 $1, 1728(%rdi), %ymm21, %ymm21
10709 ; AVX512DQ-FCP-NEXT: vmovdqa64 1536(%rdi), %xmm25
10710 ; AVX512DQ-FCP-NEXT: vinserti32x4 $1, 1664(%rdi), %ymm25, %ymm25
10711 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm26 = ymm25[0],ymm21[0],ymm25[2],ymm21[2]
10712 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm26, %zmm7, %zmm12
10713 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
10714 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
10715 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1}
10716 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3]
10717 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm7, %zmm13
10718 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
10719 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
10720 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1}
10721 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3]
10722 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5
10723 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
10724 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
10725 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1}
10726 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
10727 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0
10728 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
10729 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
10730 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1}
10731 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm25[1],ymm21[1],ymm25[3],ymm21[3]
10732 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm1
10733 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
10734 ; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
10735 ; AVX512DQ-FCP-NEXT: # ymm6 = ymm6[0,1,2,3],mem[4,5,6,7]
10736 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm23, %zmm6
10737 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
10738 ; AVX512DQ-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
10739 ; AVX512DQ-FCP-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7]
10740 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
10741 ; AVX512DQ-FCP-NEXT: vinsertf64x4 $0, %ymm7, %zmm8, %zmm7
10742 ; AVX512DQ-FCP-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm8 # 32-byte Folded Reload
10743 ; AVX512DQ-FCP-NEXT: # ymm8 = mem[0,1,2,3],ymm9[4,5,6,7]
10744 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
10745 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8
10746 ; AVX512DQ-FCP-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm9 # 32-byte Folded Reload
10747 ; AVX512DQ-FCP-NEXT: # ymm9 = mem[0,1,2,3],ymm15[4,5,6,7]
10748 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm22, %zmm9
10749 ; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm11 # 32-byte Folded Reload
10750 ; AVX512DQ-FCP-NEXT: # ymm11 = ymm3[0,1,2,3],mem[4,5,6,7]
10751 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
10752 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm3, %zmm11
10753 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, 192(%rsi)
10754 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, 128(%rsi)
10755 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, 64(%rsi)
10756 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%rsi)
10757 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 192(%rdx)
10758 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%rdx)
10759 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 64(%rdx)
10760 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 128(%rdx)
10761 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10762 ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 192(%rcx)
10763 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10764 ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, (%rcx)
10765 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10766 ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 64(%rcx)
10767 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10768 ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 128(%rcx)
10769 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10770 ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 192(%r8)
10771 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10772 ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, (%r8)
10773 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10774 ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 64(%r8)
10775 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10776 ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 128(%r8)
10777 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10778 ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 192(%r9)
10779 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10780 ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, (%r9)
10781 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10782 ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 64(%r9)
10783 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10784 ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 128(%r9)
10785 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
10786 ; AVX512DQ-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
10787 ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 192(%rax)
10788 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10789 ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, (%rax)
10790 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10791 ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 64(%rax)
10792 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10793 ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 128(%rax)
10794 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
10795 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 192(%rax)
10796 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, (%rax)
10797 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 64(%rax)
10798 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, 128(%rax)
10799 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
10800 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, 128(%rax)
10801 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, 192(%rax)
10802 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, (%rax)
10803 ; AVX512DQ-FCP-NEXT: vmovaps %zmm7, 64(%rax)
10804 ; AVX512DQ-FCP-NEXT: addq $3208, %rsp # imm = 0xC88
10805 ; AVX512DQ-FCP-NEXT: vzeroupper
10806 ; AVX512DQ-FCP-NEXT: retq
10808 ; AVX512BW-LABEL: load_i64_stride8_vf32:
10809 ; AVX512BW: # %bb.0:
10810 ; AVX512BW-NEXT: subq $3208, %rsp # imm = 0xC88
10811 ; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm12
10812 ; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm16
10813 ; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm19
10814 ; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm18
10815 ; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm24
10816 ; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm27
10817 ; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm7
10818 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10819 ; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm11
10820 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10821 ; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm31
10822 ; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm3
10823 ; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm20
10824 ; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm8
10825 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10826 ; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm25
10827 ; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10828 ; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm23
10829 ; AVX512BW-NEXT: movb $-64, %al
10830 ; AVX512BW-NEXT: kmovd %eax, %k1
10831 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10]
10832 ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
10833 ; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm1
10834 ; AVX512BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm1
10835 ; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm2
10836 ; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10837 ; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm2
10838 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
10839 ; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %ymm22
10840 ; AVX512BW-NEXT: vmovdqa 1152(%rdi), %ymm10
10841 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm10[0],ymm22[0],ymm10[2],ymm22[2]
10842 ; AVX512BW-NEXT: vmovdqa 1088(%rdi), %ymm9
10843 ; AVX512BW-NEXT: vmovdqa 1024(%rdi), %ymm6
10844 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm6[0],ymm9[0],ymm6[2],ymm9[2]
10845 ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm13[2,3],ymm4[2,3]
10846 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm1
10847 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10848 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2
10849 ; AVX512BW-NEXT: vpermt2q %zmm31, %zmm0, %zmm2
10850 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm4
10851 ; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm4
10852 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1}
10853 ; AVX512BW-NEXT: vmovdqa 704(%rdi), %ymm7
10854 ; AVX512BW-NEXT: vmovdqa 640(%rdi), %ymm13
10855 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm13[0],ymm7[0],ymm13[2],ymm7[2]
10856 ; AVX512BW-NEXT: vmovdqa64 576(%rdi), %ymm17
10857 ; AVX512BW-NEXT: vmovdqa64 512(%rdi), %ymm21
10858 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm21[0],ymm17[0],ymm21[2],ymm17[2]
10859 ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm14[2,3]
10860 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm4, %zmm1
10861 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10862 ; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm4
10863 ; AVX512BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm4
10864 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm11
10865 ; AVX512BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm11
10866 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm11 {%k1}
10867 ; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm4
10868 ; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm14
10869 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm14[0],ymm4[0],ymm14[2],ymm4[2]
10870 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm25
10871 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %ymm28
10872 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm28[0],ymm25[0],ymm28[2],ymm25[2]
10873 ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3]
10874 ; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm29
10875 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1
10876 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10877 ; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm1
10878 ; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm1
10879 ; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm26
10880 ; AVX512BW-NEXT: vpermi2q %zmm12, %zmm26, %zmm0
10881 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
10882 ; AVX512BW-NEXT: vmovdqa 1728(%rdi), %ymm2
10883 ; AVX512BW-NEXT: vmovdqa 1664(%rdi), %ymm8
10884 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm8[0],ymm2[0],ymm8[2],ymm2[2]
10885 ; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %ymm30
10886 ; AVX512BW-NEXT: vmovdqa 1536(%rdi), %ymm1
10887 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm1[0],ymm30[0],ymm1[2],ymm30[2]
10888 ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm15[2,3],ymm11[2,3]
10889 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm0
10890 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10891 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11]
10892 ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
10893 ; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm11
10894 ; AVX512BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm11
10895 ; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm18
10896 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm18 {%k1}
10897 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm10[1],ymm22[1],ymm10[3],ymm22[3]
10898 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm9[1],ymm6[3],ymm9[3]
10899 ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm6[2,3],ymm5[2,3]
10900 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm18, %zmm5
10901 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10902 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5
10903 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10
10904 ; AVX512BW-NEXT: vpermt2q %zmm31, %zmm0, %zmm5
10905 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
10906 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm6
10907 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
10908 ; AVX512BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm6
10909 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1}
10910 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm13[1],ymm7[1],ymm13[3],ymm7[3]
10911 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm21[1],ymm17[1],ymm21[3],ymm17[3]
10912 ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm5[2,3],ymm3[2,3]
10913 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3
10914 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10915 ; AVX512BW-NEXT: vmovdqu64 %zmm23, (%rsp) # 64-byte Spill
10916 ; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm3
10917 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
10918 ; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm3
10919 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
10920 ; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm5
10921 ; AVX512BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm5
10922 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1}
10923 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3]
10924 ; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm7
10925 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm28[1],ymm25[1],ymm28[3],ymm25[3]
10926 ; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm6
10927 ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3]
10928 ; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm13
10929 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3
10930 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10931 ; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm15
10932 ; AVX512BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10933 ; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm3
10934 ; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10935 ; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm3
10936 ; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm9
10937 ; AVX512BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10938 ; AVX512BW-NEXT: vpermi2q %zmm12, %zmm26, %zmm0
10939 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1}
10940 ; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm4
10941 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm2[1],ymm8[3],ymm2[3]
10942 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],ymm30[1],ymm1[3],ymm30[3]
10943 ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3]
10944 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
10945 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10946 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12]
10947 ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
10948 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1
10949 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10950 ; AVX512BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm1
10951 ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm8
10952 ; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10953 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm2
10954 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm5
10955 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10956 ; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm2
10957 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10958 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
10959 ; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm2
10960 ; AVX512BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm2
10961 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
10962 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm17[0],zmm19[0],zmm17[2],zmm19[2],zmm17[4],zmm19[4],zmm17[6],zmm19[6]
10963 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
10964 ; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm25
10965 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
10966 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10967 ; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm1
10968 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10969 ; AVX512BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm1
10970 ; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm26
10971 ; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm14
10972 ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm2
10973 ; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10974 ; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm2
10975 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
10976 ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm2
10977 ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm29
10978 ; AVX512BW-NEXT: vpermt2q %zmm31, %zmm0, %zmm2
10979 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm11[0],zmm22[0],zmm11[2],zmm22[2],zmm11[4],zmm22[4],zmm11[6],zmm22[6]
10980 ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm10
10981 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
10982 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
10983 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10984 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm2
10985 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10986 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm1
10987 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10988 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
10989 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm6
10990 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10991 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2
10992 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10993 ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm2
10994 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
10995 ; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm23
10996 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm20[0],zmm21[2],zmm20[2],zmm21[4],zmm20[4],zmm21[6],zmm20[6]
10997 ; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm22
10998 ; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm3 {%k1}
10999 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
11000 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11001 ; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm18
11002 ; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm1
11003 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11004 ; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm1
11005 ; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm13
11006 ; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11007 ; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm6
11008 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm2
11009 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11010 ; AVX512BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm2
11011 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
11012 ; AVX512BW-NEXT: vpermi2q %zmm16, %zmm15, %zmm0
11013 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm9[0],zmm12[0],zmm9[2],zmm12[2],zmm9[4],zmm12[4],zmm9[6],zmm12[6]
11014 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
11015 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0
11016 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11017 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [5,13,5,13,5,13,5,13]
11018 ; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
11019 ; AVX512BW-NEXT: vpermt2q %zmm8, %zmm2, %zmm4
11020 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1
11021 ; AVX512BW-NEXT: vpermt2q %zmm7, %zmm2, %zmm1
11022 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm4[4,5,6,7]
11023 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [0,8,0,8,0,8,0,8]
11024 ; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
11025 ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm9
11026 ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm0
11027 ; AVX512BW-NEXT: vpermt2q %zmm19, %zmm21, %zmm0
11028 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11029 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [1,9,1,9,1,9,1,9]
11030 ; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
11031 ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm0
11032 ; AVX512BW-NEXT: vpermt2q %zmm19, %zmm16, %zmm0
11033 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11034 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [6,14,6,14,6,14,6,14]
11035 ; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
11036 ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm0
11037 ; AVX512BW-NEXT: vpermt2q %zmm19, %zmm23, %zmm0
11038 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm17[1],zmm19[1],zmm17[3],zmm19[3],zmm17[5],zmm19[5],zmm17[7],zmm19[7]
11039 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,7,15,7,15,7,15]
11040 ; AVX512BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
11041 ; AVX512BW-NEXT: vpermt2q %zmm19, %zmm17, %zmm9
11042 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm27[0],zmm24[0],zmm27[2],zmm24[2],zmm27[4],zmm24[4],zmm27[6],zmm24[6]
11043 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11044 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm27[1],zmm24[1],zmm27[3],zmm24[3],zmm27[5],zmm24[5],zmm27[7],zmm24[7]
11045 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11046 ; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm0
11047 ; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm8
11048 ; AVX512BW-NEXT: vpermt2q %zmm24, %zmm21, %zmm27
11049 ; AVX512BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11050 ; AVX512BW-NEXT: vpermt2q %zmm24, %zmm16, %zmm0
11051 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11052 ; AVX512BW-NEXT: vpermt2q %zmm24, %zmm2, %zmm8
11053 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1}
11054 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm0
11055 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11056 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
11057 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm7
11058 ; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm9
11059 ; AVX512BW-NEXT: vpermt2q %zmm25, %zmm2, %zmm7
11060 ; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm13
11061 ; AVX512BW-NEXT: vpermt2q %zmm26, %zmm2, %zmm14
11062 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm7[4,5,6,7]
11063 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0
11064 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm4
11065 ; AVX512BW-NEXT: vpermt2q %zmm10, %zmm21, %zmm4
11066 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11067 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm4
11068 ; AVX512BW-NEXT: vpermt2q %zmm10, %zmm16, %zmm4
11069 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11070 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm30
11071 ; AVX512BW-NEXT: vpermt2q %zmm10, %zmm23, %zmm30
11072 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm11[1],zmm10[1],zmm11[3],zmm10[3],zmm11[5],zmm10[5],zmm11[7],zmm10[7]
11073 ; AVX512BW-NEXT: vpermt2q %zmm10, %zmm17, %zmm0
11074 ; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm4
11075 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k1} = zmm29[0],zmm31[0],zmm29[2],zmm31[2],zmm29[4],zmm31[4],zmm29[6],zmm31[6]
11076 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm29[1],zmm31[1],zmm29[3],zmm31[3],zmm29[5],zmm31[5],zmm29[7],zmm31[7]
11077 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11078 ; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm0
11079 ; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm14
11080 ; AVX512BW-NEXT: vpermt2q %zmm31, %zmm21, %zmm4
11081 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11082 ; AVX512BW-NEXT: vpermt2q %zmm31, %zmm16, %zmm0
11083 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11084 ; AVX512BW-NEXT: vpermt2q %zmm31, %zmm2, %zmm14
11085 ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm11 {%k1}
11086 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm0
11087 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11088 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
11089 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
11090 ; AVX512BW-NEXT: vpermt2q %zmm28, %zmm2, %zmm8
11091 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
11092 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm11
11093 ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm11 # 64-byte Folded Reload
11094 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm8[4,5,6,7]
11095 ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm0
11096 ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm25
11097 ; AVX512BW-NEXT: vpermt2q %zmm20, %zmm21, %zmm25
11098 ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm5
11099 ; AVX512BW-NEXT: vpermt2q %zmm20, %zmm16, %zmm5
11100 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11101 ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm27
11102 ; AVX512BW-NEXT: vpermt2q %zmm20, %zmm23, %zmm27
11103 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm22[1],zmm20[1],zmm22[3],zmm20[3],zmm22[5],zmm20[5],zmm22[7],zmm20[7]
11104 ; AVX512BW-NEXT: vpermt2q %zmm20, %zmm17, %zmm0
11105 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
11106 ; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm31 # 64-byte Reload
11107 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k1} = zmm31[0],zmm1[0],zmm31[2],zmm1[2],zmm31[4],zmm1[4],zmm31[6],zmm1[6]
11108 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm31[1],zmm1[1],zmm31[3],zmm1[3],zmm31[5],zmm1[5],zmm31[7],zmm1[7]
11109 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11110 ; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm0
11111 ; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm24
11112 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm31
11113 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm16, %zmm0
11114 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11115 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm24
11116 ; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm14 {%k1}
11117 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm14, %zmm0
11118 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11119 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
11120 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm11
11121 ; AVX512BW-NEXT: vpermt2q %zmm18, %zmm2, %zmm11
11122 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
11123 ; AVX512BW-NEXT: vpermt2q %zmm19, %zmm2, %zmm6
11124 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0,1,2,3],ymm11[4,5,6,7]
11125 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
11126 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
11127 ; AVX512BW-NEXT: vpermi2q %zmm20, %zmm5, %zmm2
11128 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
11129 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm22[1],zmm12[1],zmm22[3],zmm12[3],zmm22[5],zmm12[5],zmm22[7],zmm12[7]
11130 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm14 {%k1}
11131 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm14, %zmm2
11132 ; AVX512BW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill
11133 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm2
11134 ; AVX512BW-NEXT: vpermt2q %zmm20, %zmm21, %zmm2
11135 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11136 ; AVX512BW-NEXT: vpermi2q %zmm12, %zmm22, %zmm21
11137 ; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11138 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm2
11139 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm21
11140 ; AVX512BW-NEXT: vpermt2q %zmm20, %zmm16, %zmm2
11141 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11142 ; AVX512BW-NEXT: vpermi2q %zmm12, %zmm22, %zmm16
11143 ; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11144 ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm16
11145 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
11146 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm14
11147 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
11148 ; AVX512BW-NEXT: vpermt2q %zmm24, %zmm23, %zmm14
11149 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
11150 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
11151 ; AVX512BW-NEXT: vpermt2q %zmm26, %zmm23, %zmm11
11152 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm12
11153 ; AVX512BW-NEXT: vpermt2q %zmm9, %zmm23, %zmm12
11154 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm29
11155 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
11156 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8
11157 ; AVX512BW-NEXT: vpermt2q %zmm13, %zmm23, %zmm8
11158 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
11159 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm10
11160 ; AVX512BW-NEXT: vpermt2q %zmm28, %zmm23, %zmm10
11161 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm5
11162 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm7
11163 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
11164 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm23, %zmm7
11165 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm0
11166 ; AVX512BW-NEXT: vpermt2q %zmm18, %zmm23, %zmm0
11167 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11168 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
11169 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
11170 ; AVX512BW-NEXT: vpermt2q %zmm19, %zmm23, %zmm0
11171 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11172 ; AVX512BW-NEXT: vpermi2q %zmm16, %zmm22, %zmm23
11173 ; AVX512BW-NEXT: vpermt2q %zmm16, %zmm17, %zmm22
11174 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k1} = zmm21[0],zmm20[0],zmm21[2],zmm20[2],zmm21[4],zmm20[4],zmm21[6],zmm20[6]
11175 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm22 {%k1} = zmm21[1],zmm20[1],zmm21[3],zmm20[3],zmm21[5],zmm20[5],zmm21[7],zmm20[7]
11176 ; AVX512BW-NEXT: vpermt2q %zmm29, %zmm17, %zmm3
11177 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11178 ; AVX512BW-NEXT: vpermt2q %zmm13, %zmm17, %zmm2
11179 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11180 ; AVX512BW-NEXT: vpermt2q %zmm28, %zmm17, %zmm9
11181 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm17, %zmm5
11182 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11183 ; AVX512BW-NEXT: vpermt2q %zmm18, %zmm17, %zmm15
11184 ; AVX512BW-NEXT: vpermt2q %zmm19, %zmm17, %zmm1
11185 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11186 ; AVX512BW-NEXT: vpermt2q %zmm24, %zmm17, %zmm6
11187 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11188 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
11189 ; AVX512BW-NEXT: vpermt2q %zmm26, %zmm17, %zmm3
11190 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm14[4,5,6,7]
11191 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11192 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm14
11193 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm12[4,5,6,7]
11194 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm30, %zmm4
11195 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm10[4,5,6,7]
11196 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm10
11197 ; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm25 {%k1}
11198 ; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm0
11199 ; AVX512BW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0
11200 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm1
11201 ; AVX512BW-NEXT: vinserti128 $1, 128(%rdi), %ymm1, %ymm1
11202 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
11203 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm25, %zmm2
11204 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
11205 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
11206 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1}
11207 ; AVX512BW-NEXT: vmovdqa 576(%rdi), %xmm5
11208 ; AVX512BW-NEXT: vinserti128 $1, 704(%rdi), %ymm5, %ymm5
11209 ; AVX512BW-NEXT: vmovdqa 512(%rdi), %xmm6
11210 ; AVX512BW-NEXT: vinserti128 $1, 640(%rdi), %ymm6, %ymm6
11211 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm6[0],ymm5[0],ymm6[2],ymm5[2]
11212 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm18, %zmm7, %zmm27
11213 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
11214 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
11215 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1}
11216 ; AVX512BW-NEXT: vmovdqa 1088(%rdi), %xmm13
11217 ; AVX512BW-NEXT: vinserti128 $1, 1216(%rdi), %ymm13, %ymm13
11218 ; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %xmm18
11219 ; AVX512BW-NEXT: vinserti32x4 $1, 1152(%rdi), %ymm18, %ymm18
11220 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm19 = ymm18[0],ymm13[0],ymm18[2],ymm13[2]
11221 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm19, %zmm7, %zmm19
11222 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
11223 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
11224 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1}
11225 ; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %xmm21
11226 ; AVX512BW-NEXT: vinserti32x4 $1, 1728(%rdi), %ymm21, %ymm21
11227 ; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %xmm25
11228 ; AVX512BW-NEXT: vinserti32x4 $1, 1664(%rdi), %ymm25, %ymm25
11229 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm26 = ymm25[0],ymm21[0],ymm25[2],ymm21[2]
11230 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm26, %zmm7, %zmm12
11231 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
11232 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
11233 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1}
11234 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3]
11235 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm7, %zmm13
11236 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
11237 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
11238 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1}
11239 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3]
11240 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5
11241 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
11242 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
11243 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1}
11244 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
11245 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0
11246 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
11247 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
11248 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1}
11249 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm25[1],ymm21[1],ymm25[3],ymm21[3]
11250 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm1
11251 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
11252 ; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
11253 ; AVX512BW-NEXT: # ymm6 = ymm6[0,1,2,3],mem[4,5,6,7]
11254 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm23, %zmm6
11255 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
11256 ; AVX512BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
11257 ; AVX512BW-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7]
11258 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
11259 ; AVX512BW-NEXT: vinsertf64x4 $0, %ymm7, %zmm8, %zmm7
11260 ; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm8 # 32-byte Folded Reload
11261 ; AVX512BW-NEXT: # ymm8 = mem[0,1,2,3],ymm9[4,5,6,7]
11262 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
11263 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8
11264 ; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm9 # 32-byte Folded Reload
11265 ; AVX512BW-NEXT: # ymm9 = mem[0,1,2,3],ymm15[4,5,6,7]
11266 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm22, %zmm9
11267 ; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm11 # 32-byte Folded Reload
11268 ; AVX512BW-NEXT: # ymm11 = ymm3[0,1,2,3],mem[4,5,6,7]
11269 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
11270 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm3, %zmm11
11271 ; AVX512BW-NEXT: vmovdqa64 %zmm12, 192(%rsi)
11272 ; AVX512BW-NEXT: vmovdqa64 %zmm19, 128(%rsi)
11273 ; AVX512BW-NEXT: vmovdqa64 %zmm27, 64(%rsi)
11274 ; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rsi)
11275 ; AVX512BW-NEXT: vmovdqa64 %zmm1, 192(%rdx)
11276 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
11277 ; AVX512BW-NEXT: vmovdqa64 %zmm5, 64(%rdx)
11278 ; AVX512BW-NEXT: vmovdqa64 %zmm13, 128(%rdx)
11279 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11280 ; AVX512BW-NEXT: vmovaps %zmm0, 192(%rcx)
11281 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11282 ; AVX512BW-NEXT: vmovaps %zmm0, (%rcx)
11283 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11284 ; AVX512BW-NEXT: vmovaps %zmm0, 64(%rcx)
11285 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11286 ; AVX512BW-NEXT: vmovaps %zmm0, 128(%rcx)
11287 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11288 ; AVX512BW-NEXT: vmovaps %zmm0, 192(%r8)
11289 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11290 ; AVX512BW-NEXT: vmovaps %zmm0, (%r8)
11291 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11292 ; AVX512BW-NEXT: vmovaps %zmm0, 64(%r8)
11293 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11294 ; AVX512BW-NEXT: vmovaps %zmm0, 128(%r8)
11295 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11296 ; AVX512BW-NEXT: vmovaps %zmm0, 192(%r9)
11297 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11298 ; AVX512BW-NEXT: vmovaps %zmm0, (%r9)
11299 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11300 ; AVX512BW-NEXT: vmovaps %zmm0, 64(%r9)
11301 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11302 ; AVX512BW-NEXT: vmovaps %zmm0, 128(%r9)
11303 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
11304 ; AVX512BW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
11305 ; AVX512BW-NEXT: vmovaps %zmm0, 192(%rax)
11306 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11307 ; AVX512BW-NEXT: vmovaps %zmm0, (%rax)
11308 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11309 ; AVX512BW-NEXT: vmovaps %zmm0, 64(%rax)
11310 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11311 ; AVX512BW-NEXT: vmovaps %zmm0, 128(%rax)
11312 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
11313 ; AVX512BW-NEXT: vmovdqa64 %zmm6, 192(%rax)
11314 ; AVX512BW-NEXT: vmovdqa64 %zmm10, (%rax)
11315 ; AVX512BW-NEXT: vmovdqa64 %zmm4, 64(%rax)
11316 ; AVX512BW-NEXT: vmovdqa64 %zmm14, 128(%rax)
11317 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
11318 ; AVX512BW-NEXT: vmovdqa64 %zmm11, 128(%rax)
11319 ; AVX512BW-NEXT: vmovdqa64 %zmm9, 192(%rax)
11320 ; AVX512BW-NEXT: vmovdqa64 %zmm8, (%rax)
11321 ; AVX512BW-NEXT: vmovaps %zmm7, 64(%rax)
11322 ; AVX512BW-NEXT: addq $3208, %rsp # imm = 0xC88
11323 ; AVX512BW-NEXT: vzeroupper
11324 ; AVX512BW-NEXT: retq
11326 ; AVX512BW-FCP-LABEL: load_i64_stride8_vf32:
11327 ; AVX512BW-FCP: # %bb.0:
11328 ; AVX512BW-FCP-NEXT: subq $3208, %rsp # imm = 0xC88
11329 ; AVX512BW-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm12
11330 ; AVX512BW-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm16
11331 ; AVX512BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm19
11332 ; AVX512BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm18
11333 ; AVX512BW-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm24
11334 ; AVX512BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm27
11335 ; AVX512BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm7
11336 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11337 ; AVX512BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm11
11338 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11339 ; AVX512BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm31
11340 ; AVX512BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm3
11341 ; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm20
11342 ; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm8
11343 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11344 ; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm25
11345 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11346 ; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm23
11347 ; AVX512BW-FCP-NEXT: movb $-64, %al
11348 ; AVX512BW-FCP-NEXT: kmovd %eax, %k1
11349 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10]
11350 ; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
11351 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm1
11352 ; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm1
11353 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm2
11354 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11355 ; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm2
11356 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
11357 ; AVX512BW-FCP-NEXT: vmovdqa64 1216(%rdi), %ymm22
11358 ; AVX512BW-FCP-NEXT: vmovdqa 1152(%rdi), %ymm10
11359 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm10[0],ymm22[0],ymm10[2],ymm22[2]
11360 ; AVX512BW-FCP-NEXT: vmovdqa 1088(%rdi), %ymm9
11361 ; AVX512BW-FCP-NEXT: vmovdqa 1024(%rdi), %ymm6
11362 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm6[0],ymm9[0],ymm6[2],ymm9[2]
11363 ; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm13[2,3],ymm4[2,3]
11364 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm1
11365 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11366 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm2
11367 ; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm2
11368 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm4
11369 ; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm4
11370 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1}
11371 ; AVX512BW-FCP-NEXT: vmovdqa 704(%rdi), %ymm7
11372 ; AVX512BW-FCP-NEXT: vmovdqa 640(%rdi), %ymm13
11373 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm13[0],ymm7[0],ymm13[2],ymm7[2]
11374 ; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %ymm17
11375 ; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %ymm21
11376 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm21[0],ymm17[0],ymm21[2],ymm17[2]
11377 ; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm14[2,3]
11378 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm4, %zmm1
11379 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11380 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm4
11381 ; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm4
11382 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm11
11383 ; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm11
11384 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm11 {%k1}
11385 ; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm4
11386 ; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm14
11387 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm14[0],ymm4[0],ymm14[2],ymm4[2]
11388 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %ymm25
11389 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %ymm28
11390 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm28[0],ymm25[0],ymm28[2],ymm25[2]
11391 ; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3]
11392 ; AVX512BW-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm29
11393 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1
11394 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11395 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm1
11396 ; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm1
11397 ; AVX512BW-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm26
11398 ; AVX512BW-FCP-NEXT: vpermi2q %zmm12, %zmm26, %zmm0
11399 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
11400 ; AVX512BW-FCP-NEXT: vmovdqa 1728(%rdi), %ymm2
11401 ; AVX512BW-FCP-NEXT: vmovdqa 1664(%rdi), %ymm8
11402 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm8[0],ymm2[0],ymm8[2],ymm2[2]
11403 ; AVX512BW-FCP-NEXT: vmovdqa64 1600(%rdi), %ymm30
11404 ; AVX512BW-FCP-NEXT: vmovdqa 1536(%rdi), %ymm1
11405 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm1[0],ymm30[0],ymm1[2],ymm30[2]
11406 ; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm15[2,3],ymm11[2,3]
11407 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm0
11408 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11409 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11]
11410 ; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
11411 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm11
11412 ; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm11
11413 ; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm18
11414 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm18 {%k1}
11415 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm10[1],ymm22[1],ymm10[3],ymm22[3]
11416 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm9[1],ymm6[3],ymm9[3]
11417 ; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm6[2,3],ymm5[2,3]
11418 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm18, %zmm5
11419 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11420 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm5
11421 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10
11422 ; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm5
11423 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
11424 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm6
11425 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
11426 ; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm6
11427 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1}
11428 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm13[1],ymm7[1],ymm13[3],ymm7[3]
11429 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm21[1],ymm17[1],ymm21[3],ymm17[3]
11430 ; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm5[2,3],ymm3[2,3]
11431 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3
11432 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11433 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm23, (%rsp) # 64-byte Spill
11434 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm3
11435 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
11436 ; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm3
11437 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
11438 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm5
11439 ; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm5
11440 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1}
11441 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3]
11442 ; AVX512BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm7
11443 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm28[1],ymm25[1],ymm28[3],ymm25[3]
11444 ; AVX512BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm6
11445 ; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3]
11446 ; AVX512BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm13
11447 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3
11448 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11449 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm15
11450 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11451 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm3
11452 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11453 ; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm3
11454 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm9
11455 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11456 ; AVX512BW-FCP-NEXT: vpermi2q %zmm12, %zmm26, %zmm0
11457 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1}
11458 ; AVX512BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm4
11459 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm2[1],ymm8[3],ymm2[3]
11460 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],ymm30[1],ymm1[3],ymm30[3]
11461 ; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3]
11462 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
11463 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11464 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12]
11465 ; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
11466 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm1
11467 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11468 ; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm1
11469 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm8
11470 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11471 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm2
11472 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm5
11473 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11474 ; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm2
11475 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11476 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
11477 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm2
11478 ; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm2
11479 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
11480 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm17[0],zmm19[0],zmm17[2],zmm19[2],zmm17[4],zmm19[4],zmm17[6],zmm19[6]
11481 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
11482 ; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm25
11483 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
11484 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11485 ; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm1
11486 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11487 ; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm1
11488 ; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm26
11489 ; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm14
11490 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm2
11491 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11492 ; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm2
11493 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
11494 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm2
11495 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm29
11496 ; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm2
11497 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm11[0],zmm22[0],zmm11[2],zmm22[2],zmm11[4],zmm22[4],zmm11[6],zmm22[6]
11498 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm10
11499 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
11500 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
11501 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11502 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm2
11503 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11504 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1
11505 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11506 ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
11507 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm6
11508 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11509 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2
11510 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11511 ; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm2
11512 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
11513 ; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm23
11514 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm20[0],zmm21[2],zmm20[2],zmm21[4],zmm20[4],zmm21[6],zmm20[6]
11515 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm22
11516 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm3 {%k1}
11517 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
11518 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11519 ; AVX512BW-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm18
11520 ; AVX512BW-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm1
11521 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11522 ; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm1
11523 ; AVX512BW-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm13
11524 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11525 ; AVX512BW-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm6
11526 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm2
11527 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11528 ; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm2
11529 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
11530 ; AVX512BW-FCP-NEXT: vpermi2q %zmm16, %zmm15, %zmm0
11531 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm9[0],zmm12[0],zmm9[2],zmm12[2],zmm9[4],zmm12[4],zmm9[6],zmm12[6]
11532 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
11533 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0
11534 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11535 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [5,13,5,13,5,13,5,13]
11536 ; AVX512BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
11537 ; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm4
11538 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1
11539 ; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm2, %zmm1
11540 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm4[4,5,6,7]
11541 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [0,8,0,8,0,8,0,8]
11542 ; AVX512BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
11543 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm9
11544 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm0
11545 ; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm21, %zmm0
11546 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11547 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [1,9,1,9,1,9,1,9]
11548 ; AVX512BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
11549 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm0
11550 ; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm16, %zmm0
11551 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11552 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [6,14,6,14,6,14,6,14]
11553 ; AVX512BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
11554 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm0
11555 ; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm23, %zmm0
11556 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm17[1],zmm19[1],zmm17[3],zmm19[3],zmm17[5],zmm19[5],zmm17[7],zmm19[7]
11557 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,7,15,7,15,7,15]
11558 ; AVX512BW-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
11559 ; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm17, %zmm9
11560 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm27[0],zmm24[0],zmm27[2],zmm24[2],zmm27[4],zmm24[4],zmm27[6],zmm24[6]
11561 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11562 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm27[1],zmm24[1],zmm27[3],zmm24[3],zmm27[5],zmm24[5],zmm27[7],zmm24[7]
11563 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11564 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm0
11565 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm8
11566 ; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm21, %zmm27
11567 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11568 ; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm16, %zmm0
11569 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11570 ; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm2, %zmm8
11571 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1}
11572 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm0
11573 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11574 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
11575 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm7
11576 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm9
11577 ; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm2, %zmm7
11578 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm13
11579 ; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm2, %zmm14
11580 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm7[4,5,6,7]
11581 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm0
11582 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm4
11583 ; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm21, %zmm4
11584 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11585 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm4
11586 ; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm16, %zmm4
11587 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11588 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm30
11589 ; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm23, %zmm30
11590 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm11[1],zmm10[1],zmm11[3],zmm10[3],zmm11[5],zmm10[5],zmm11[7],zmm10[7]
11591 ; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm17, %zmm0
11592 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm4
11593 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k1} = zmm29[0],zmm31[0],zmm29[2],zmm31[2],zmm29[4],zmm31[4],zmm29[6],zmm31[6]
11594 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm29[1],zmm31[1],zmm29[3],zmm31[3],zmm29[5],zmm31[5],zmm29[7],zmm31[7]
11595 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11596 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm0
11597 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm14
11598 ; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm21, %zmm4
11599 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11600 ; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm16, %zmm0
11601 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11602 ; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm2, %zmm14
11603 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm11 {%k1}
11604 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm0
11605 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11606 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
11607 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
11608 ; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm2, %zmm8
11609 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
11610 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm11
11611 ; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm11 # 64-byte Folded Reload
11612 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm8[4,5,6,7]
11613 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm0
11614 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm25
11615 ; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm21, %zmm25
11616 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm5
11617 ; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm16, %zmm5
11618 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11619 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm27
11620 ; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm23, %zmm27
11621 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm22[1],zmm20[1],zmm22[3],zmm20[3],zmm22[5],zmm20[5],zmm22[7],zmm20[7]
11622 ; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm17, %zmm0
11623 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
11624 ; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm31 # 64-byte Reload
11625 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k1} = zmm31[0],zmm1[0],zmm31[2],zmm1[2],zmm31[4],zmm1[4],zmm31[6],zmm1[6]
11626 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm31[1],zmm1[1],zmm31[3],zmm1[3],zmm31[5],zmm1[5],zmm31[7],zmm1[7]
11627 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11628 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm0
11629 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm24
11630 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm31
11631 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm16, %zmm0
11632 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11633 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm24
11634 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm14 {%k1}
11635 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm14, %zmm0
11636 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11637 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
11638 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm11
11639 ; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm2, %zmm11
11640 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
11641 ; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm2, %zmm6
11642 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0,1,2,3],ymm11[4,5,6,7]
11643 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
11644 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
11645 ; AVX512BW-FCP-NEXT: vpermi2q %zmm20, %zmm5, %zmm2
11646 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
11647 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm22[1],zmm12[1],zmm22[3],zmm12[3],zmm22[5],zmm12[5],zmm22[7],zmm12[7]
11648 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm14 {%k1}
11649 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm14, %zmm2
11650 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill
11651 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm2
11652 ; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm21, %zmm2
11653 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11654 ; AVX512BW-FCP-NEXT: vpermi2q %zmm12, %zmm22, %zmm21
11655 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11656 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm2
11657 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm21
11658 ; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm16, %zmm2
11659 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11660 ; AVX512BW-FCP-NEXT: vpermi2q %zmm12, %zmm22, %zmm16
11661 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11662 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm16
11663 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
11664 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm14
11665 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
11666 ; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm23, %zmm14
11667 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
11668 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
11669 ; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm23, %zmm11
11670 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm12
11671 ; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm23, %zmm12
11672 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm29
11673 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
11674 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8
11675 ; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm23, %zmm8
11676 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
11677 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm10
11678 ; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm23, %zmm10
11679 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5
11680 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm7
11681 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
11682 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm23, %zmm7
11683 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm0
11684 ; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm23, %zmm0
11685 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11686 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
11687 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0
11688 ; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm23, %zmm0
11689 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11690 ; AVX512BW-FCP-NEXT: vpermi2q %zmm16, %zmm22, %zmm23
11691 ; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm17, %zmm22
11692 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k1} = zmm21[0],zmm20[0],zmm21[2],zmm20[2],zmm21[4],zmm20[4],zmm21[6],zmm20[6]
11693 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm22 {%k1} = zmm21[1],zmm20[1],zmm21[3],zmm20[3],zmm21[5],zmm20[5],zmm21[7],zmm20[7]
11694 ; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm17, %zmm3
11695 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11696 ; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm17, %zmm2
11697 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11698 ; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm17, %zmm9
11699 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm17, %zmm5
11700 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11701 ; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm17, %zmm15
11702 ; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm17, %zmm1
11703 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11704 ; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm17, %zmm6
11705 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11706 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
11707 ; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm17, %zmm3
11708 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm14[4,5,6,7]
11709 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11710 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm14
11711 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm12[4,5,6,7]
11712 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm30, %zmm4
11713 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm10[4,5,6,7]
11714 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm10
11715 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm25 {%k1}
11716 ; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm0
11717 ; AVX512BW-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0
11718 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm1
11719 ; AVX512BW-FCP-NEXT: vinserti128 $1, 128(%rdi), %ymm1, %ymm1
11720 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
11721 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm25, %zmm2
11722 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
11723 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
11724 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1}
11725 ; AVX512BW-FCP-NEXT: vmovdqa 576(%rdi), %xmm5
11726 ; AVX512BW-FCP-NEXT: vinserti128 $1, 704(%rdi), %ymm5, %ymm5
11727 ; AVX512BW-FCP-NEXT: vmovdqa 512(%rdi), %xmm6
11728 ; AVX512BW-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm6, %ymm6
11729 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm6[0],ymm5[0],ymm6[2],ymm5[2]
11730 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm18, %zmm7, %zmm27
11731 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
11732 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
11733 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1}
11734 ; AVX512BW-FCP-NEXT: vmovdqa 1088(%rdi), %xmm13
11735 ; AVX512BW-FCP-NEXT: vinserti128 $1, 1216(%rdi), %ymm13, %ymm13
11736 ; AVX512BW-FCP-NEXT: vmovdqa64 1024(%rdi), %xmm18
11737 ; AVX512BW-FCP-NEXT: vinserti32x4 $1, 1152(%rdi), %ymm18, %ymm18
11738 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm19 = ymm18[0],ymm13[0],ymm18[2],ymm13[2]
11739 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm19, %zmm7, %zmm19
11740 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
11741 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
11742 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1}
11743 ; AVX512BW-FCP-NEXT: vmovdqa64 1600(%rdi), %xmm21
11744 ; AVX512BW-FCP-NEXT: vinserti32x4 $1, 1728(%rdi), %ymm21, %ymm21
11745 ; AVX512BW-FCP-NEXT: vmovdqa64 1536(%rdi), %xmm25
11746 ; AVX512BW-FCP-NEXT: vinserti32x4 $1, 1664(%rdi), %ymm25, %ymm25
11747 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm26 = ymm25[0],ymm21[0],ymm25[2],ymm21[2]
11748 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm26, %zmm7, %zmm12
11749 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
11750 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
11751 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1}
11752 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3]
11753 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm7, %zmm13
11754 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
11755 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
11756 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1}
11757 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3]
11758 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5
11759 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
11760 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
11761 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1}
11762 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
11763 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0
11764 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
11765 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
11766 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1}
11767 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm25[1],ymm21[1],ymm25[3],ymm21[3]
11768 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm1
11769 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
11770 ; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
11771 ; AVX512BW-FCP-NEXT: # ymm6 = ymm6[0,1,2,3],mem[4,5,6,7]
11772 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm23, %zmm6
11773 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
11774 ; AVX512BW-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
11775 ; AVX512BW-FCP-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7]
11776 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
11777 ; AVX512BW-FCP-NEXT: vinsertf64x4 $0, %ymm7, %zmm8, %zmm7
11778 ; AVX512BW-FCP-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm8 # 32-byte Folded Reload
11779 ; AVX512BW-FCP-NEXT: # ymm8 = mem[0,1,2,3],ymm9[4,5,6,7]
11780 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
11781 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8
11782 ; AVX512BW-FCP-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm9 # 32-byte Folded Reload
11783 ; AVX512BW-FCP-NEXT: # ymm9 = mem[0,1,2,3],ymm15[4,5,6,7]
11784 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm22, %zmm9
11785 ; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm11 # 32-byte Folded Reload
11786 ; AVX512BW-FCP-NEXT: # ymm11 = ymm3[0,1,2,3],mem[4,5,6,7]
11787 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
11788 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm3, %zmm11
11789 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, 192(%rsi)
11790 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, 128(%rsi)
11791 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, 64(%rsi)
11792 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%rsi)
11793 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 192(%rdx)
11794 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%rdx)
11795 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 64(%rdx)
11796 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, 128(%rdx)
11797 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11798 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 192(%rcx)
11799 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11800 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, (%rcx)
11801 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11802 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 64(%rcx)
11803 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11804 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 128(%rcx)
11805 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11806 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 192(%r8)
11807 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11808 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, (%r8)
11809 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11810 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 64(%r8)
11811 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11812 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 128(%r8)
11813 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11814 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 192(%r9)
11815 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11816 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, (%r9)
11817 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11818 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 64(%r9)
11819 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11820 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 128(%r9)
11821 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
11822 ; AVX512BW-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
11823 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 192(%rax)
11824 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11825 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, (%rax)
11826 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11827 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 64(%rax)
11828 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11829 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 128(%rax)
11830 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
11831 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 192(%rax)
11832 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, (%rax)
11833 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 64(%rax)
11834 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, 128(%rax)
11835 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
11836 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, 128(%rax)
11837 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, 192(%rax)
11838 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, (%rax)
11839 ; AVX512BW-FCP-NEXT: vmovaps %zmm7, 64(%rax)
11840 ; AVX512BW-FCP-NEXT: addq $3208, %rsp # imm = 0xC88
11841 ; AVX512BW-FCP-NEXT: vzeroupper
11842 ; AVX512BW-FCP-NEXT: retq
11844 ; AVX512DQ-BW-LABEL: load_i64_stride8_vf32:
11845 ; AVX512DQ-BW: # %bb.0:
11846 ; AVX512DQ-BW-NEXT: subq $3208, %rsp # imm = 0xC88
11847 ; AVX512DQ-BW-NEXT: vmovdqa64 1856(%rdi), %zmm12
11848 ; AVX512DQ-BW-NEXT: vmovdqa64 1984(%rdi), %zmm16
11849 ; AVX512DQ-BW-NEXT: vmovdqa64 1344(%rdi), %zmm19
11850 ; AVX512DQ-BW-NEXT: vmovdqa64 1280(%rdi), %zmm18
11851 ; AVX512DQ-BW-NEXT: vmovdqa64 1472(%rdi), %zmm24
11852 ; AVX512DQ-BW-NEXT: vmovdqa64 1408(%rdi), %zmm27
11853 ; AVX512DQ-BW-NEXT: vmovdqa64 832(%rdi), %zmm7
11854 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11855 ; AVX512DQ-BW-NEXT: vmovdqa64 768(%rdi), %zmm11
11856 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11857 ; AVX512DQ-BW-NEXT: vmovdqa64 960(%rdi), %zmm31
11858 ; AVX512DQ-BW-NEXT: vmovdqa64 896(%rdi), %zmm3
11859 ; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm20
11860 ; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm8
11861 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11862 ; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm25
11863 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11864 ; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm23
11865 ; AVX512DQ-BW-NEXT: movb $-64, %al
11866 ; AVX512DQ-BW-NEXT: kmovd %eax, %k1
11867 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10]
11868 ; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
11869 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm1
11870 ; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm1
11871 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm2
11872 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11873 ; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm2
11874 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
11875 ; AVX512DQ-BW-NEXT: vmovdqa64 1216(%rdi), %ymm22
11876 ; AVX512DQ-BW-NEXT: vmovdqa 1152(%rdi), %ymm10
11877 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm10[0],ymm22[0],ymm10[2],ymm22[2]
11878 ; AVX512DQ-BW-NEXT: vmovdqa 1088(%rdi), %ymm9
11879 ; AVX512DQ-BW-NEXT: vmovdqa 1024(%rdi), %ymm6
11880 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm6[0],ymm9[0],ymm6[2],ymm9[2]
11881 ; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm13[2,3],ymm4[2,3]
11882 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm1
11883 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11884 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm2
11885 ; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm0, %zmm2
11886 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm4
11887 ; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm4
11888 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1}
11889 ; AVX512DQ-BW-NEXT: vmovdqa 704(%rdi), %ymm7
11890 ; AVX512DQ-BW-NEXT: vmovdqa 640(%rdi), %ymm13
11891 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm13[0],ymm7[0],ymm13[2],ymm7[2]
11892 ; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %ymm17
11893 ; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %ymm21
11894 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm21[0],ymm17[0],ymm21[2],ymm17[2]
11895 ; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm14[2,3]
11896 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm11, %zmm4, %zmm1
11897 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11898 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm4
11899 ; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm4
11900 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm11
11901 ; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm11
11902 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm11 {%k1}
11903 ; AVX512DQ-BW-NEXT: vmovdqa 192(%rdi), %ymm4
11904 ; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm14
11905 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm14[0],ymm4[0],ymm14[2],ymm4[2]
11906 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %ymm25
11907 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %ymm28
11908 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm28[0],ymm25[0],ymm28[2],ymm25[2]
11909 ; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3]
11910 ; AVX512DQ-BW-NEXT: vmovdqa64 1920(%rdi), %zmm29
11911 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1
11912 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11913 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm1
11914 ; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm1
11915 ; AVX512DQ-BW-NEXT: vmovdqa64 1792(%rdi), %zmm26
11916 ; AVX512DQ-BW-NEXT: vpermi2q %zmm12, %zmm26, %zmm0
11917 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
11918 ; AVX512DQ-BW-NEXT: vmovdqa 1728(%rdi), %ymm2
11919 ; AVX512DQ-BW-NEXT: vmovdqa 1664(%rdi), %ymm8
11920 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm8[0],ymm2[0],ymm8[2],ymm2[2]
11921 ; AVX512DQ-BW-NEXT: vmovdqa64 1600(%rdi), %ymm30
11922 ; AVX512DQ-BW-NEXT: vmovdqa 1536(%rdi), %ymm1
11923 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm1[0],ymm30[0],ymm1[2],ymm30[2]
11924 ; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm15[2,3],ymm11[2,3]
11925 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm0
11926 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11927 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11]
11928 ; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
11929 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm11
11930 ; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm11
11931 ; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm18
11932 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm18 {%k1}
11933 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm10[1],ymm22[1],ymm10[3],ymm22[3]
11934 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm9[1],ymm6[3],ymm9[3]
11935 ; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm6[2,3],ymm5[2,3]
11936 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm18, %zmm5
11937 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11938 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm5
11939 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm10
11940 ; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm0, %zmm5
11941 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
11942 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm6
11943 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
11944 ; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm6
11945 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1}
11946 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm13[1],ymm7[1],ymm13[3],ymm7[3]
11947 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm21[1],ymm17[1],ymm21[3],ymm17[3]
11948 ; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm5[2,3],ymm3[2,3]
11949 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3
11950 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11951 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm23, (%rsp) # 64-byte Spill
11952 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm3
11953 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
11954 ; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm3
11955 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
11956 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm5
11957 ; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm5
11958 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1}
11959 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3]
11960 ; AVX512DQ-BW-NEXT: vmovdqa64 1088(%rdi), %zmm7
11961 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm28[1],ymm25[1],ymm28[3],ymm25[3]
11962 ; AVX512DQ-BW-NEXT: vmovdqa64 1024(%rdi), %zmm6
11963 ; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3]
11964 ; AVX512DQ-BW-NEXT: vmovdqa64 1216(%rdi), %zmm13
11965 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3
11966 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11967 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm15
11968 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11969 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm3
11970 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11971 ; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm3
11972 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm9
11973 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11974 ; AVX512DQ-BW-NEXT: vpermi2q %zmm12, %zmm26, %zmm0
11975 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1}
11976 ; AVX512DQ-BW-NEXT: vmovdqa64 1152(%rdi), %zmm4
11977 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm2[1],ymm8[3],ymm2[3]
11978 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],ymm30[1],ymm1[3],ymm30[3]
11979 ; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3]
11980 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
11981 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11982 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12]
11983 ; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
11984 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm1
11985 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11986 ; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm1
11987 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm8
11988 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11989 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm2
11990 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm5
11991 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11992 ; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm2
11993 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11994 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
11995 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm2
11996 ; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm2
11997 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
11998 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm17[0],zmm19[0],zmm17[2],zmm19[2],zmm17[4],zmm19[4],zmm17[6],zmm19[6]
11999 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
12000 ; AVX512DQ-BW-NEXT: vmovdqa64 704(%rdi), %zmm25
12001 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
12002 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12003 ; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %zmm1
12004 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12005 ; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm1
12006 ; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %zmm26
12007 ; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %zmm14
12008 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm2
12009 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12010 ; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm2
12011 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
12012 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm2
12013 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm29
12014 ; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm0, %zmm2
12015 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm11[0],zmm22[0],zmm11[2],zmm22[2],zmm11[4],zmm22[4],zmm11[6],zmm22[6]
12016 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm10
12017 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
12018 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
12019 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12020 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm2
12021 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12022 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm1
12023 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12024 ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
12025 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm6
12026 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12027 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm2
12028 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12029 ; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm2
12030 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
12031 ; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm23
12032 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm20[0],zmm21[2],zmm20[2],zmm21[4],zmm20[4],zmm21[6],zmm20[6]
12033 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm22
12034 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm3 {%k1}
12035 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
12036 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12037 ; AVX512DQ-BW-NEXT: vmovdqa64 1728(%rdi), %zmm18
12038 ; AVX512DQ-BW-NEXT: vmovdqa64 1664(%rdi), %zmm1
12039 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12040 ; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm1
12041 ; AVX512DQ-BW-NEXT: vmovdqa64 1600(%rdi), %zmm13
12042 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12043 ; AVX512DQ-BW-NEXT: vmovdqa64 1536(%rdi), %zmm6
12044 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm2
12045 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12046 ; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm2
12047 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
12048 ; AVX512DQ-BW-NEXT: vpermi2q %zmm16, %zmm15, %zmm0
12049 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm9[0],zmm12[0],zmm9[2],zmm12[2],zmm9[4],zmm12[4],zmm9[6],zmm12[6]
12050 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
12051 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0
12052 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12053 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [5,13,5,13,5,13,5,13]
12054 ; AVX512DQ-BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
12055 ; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm2, %zmm4
12056 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm1
12057 ; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm2, %zmm1
12058 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm4[4,5,6,7]
12059 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [0,8,0,8,0,8,0,8]
12060 ; AVX512DQ-BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
12061 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm9
12062 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm0
12063 ; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm21, %zmm0
12064 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12065 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [1,9,1,9,1,9,1,9]
12066 ; AVX512DQ-BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
12067 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm0
12068 ; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm16, %zmm0
12069 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12070 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [6,14,6,14,6,14,6,14]
12071 ; AVX512DQ-BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
12072 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm0
12073 ; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm23, %zmm0
12074 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm17[1],zmm19[1],zmm17[3],zmm19[3],zmm17[5],zmm19[5],zmm17[7],zmm19[7]
12075 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,7,15,7,15,7,15]
12076 ; AVX512DQ-BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
12077 ; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm17, %zmm9
12078 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm27[0],zmm24[0],zmm27[2],zmm24[2],zmm27[4],zmm24[4],zmm27[6],zmm24[6]
12079 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12080 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm27[1],zmm24[1],zmm27[3],zmm24[3],zmm27[5],zmm24[5],zmm27[7],zmm24[7]
12081 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12082 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm0
12083 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm8
12084 ; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm21, %zmm27
12085 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12086 ; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm16, %zmm0
12087 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12088 ; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm2, %zmm8
12089 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1}
12090 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm0
12091 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12092 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
12093 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm7
12094 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm9
12095 ; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm2, %zmm7
12096 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm13
12097 ; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm2, %zmm14
12098 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm7[4,5,6,7]
12099 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm0
12100 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm4
12101 ; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm21, %zmm4
12102 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12103 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm4
12104 ; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm16, %zmm4
12105 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12106 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm30
12107 ; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm23, %zmm30
12108 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm11[1],zmm10[1],zmm11[3],zmm10[3],zmm11[5],zmm10[5],zmm11[7],zmm10[7]
12109 ; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm17, %zmm0
12110 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm4
12111 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k1} = zmm29[0],zmm31[0],zmm29[2],zmm31[2],zmm29[4],zmm31[4],zmm29[6],zmm31[6]
12112 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm29[1],zmm31[1],zmm29[3],zmm31[3],zmm29[5],zmm31[5],zmm29[7],zmm31[7]
12113 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12114 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm0
12115 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm14
12116 ; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm21, %zmm4
12117 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12118 ; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm16, %zmm0
12119 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12120 ; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm2, %zmm14
12121 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm11 {%k1}
12122 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm0
12123 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12124 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
12125 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
12126 ; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm2, %zmm8
12127 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
12128 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm11
12129 ; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm11 # 64-byte Folded Reload
12130 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm8[4,5,6,7]
12131 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm0
12132 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm25
12133 ; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm21, %zmm25
12134 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm5
12135 ; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm16, %zmm5
12136 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12137 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm27
12138 ; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm23, %zmm27
12139 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm22[1],zmm20[1],zmm22[3],zmm20[3],zmm22[5],zmm20[5],zmm22[7],zmm20[7]
12140 ; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm17, %zmm0
12141 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
12142 ; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm31 # 64-byte Reload
12143 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k1} = zmm31[0],zmm1[0],zmm31[2],zmm1[2],zmm31[4],zmm1[4],zmm31[6],zmm1[6]
12144 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm31[1],zmm1[1],zmm31[3],zmm1[3],zmm31[5],zmm1[5],zmm31[7],zmm1[7]
12145 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12146 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm0
12147 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm24
12148 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm31
12149 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm16, %zmm0
12150 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12151 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm24
12152 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm14 {%k1}
12153 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm11, %zmm14, %zmm0
12154 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12155 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
12156 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm11
12157 ; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm2, %zmm11
12158 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
12159 ; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm2, %zmm6
12160 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0,1,2,3],ymm11[4,5,6,7]
12161 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
12162 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
12163 ; AVX512DQ-BW-NEXT: vpermi2q %zmm20, %zmm5, %zmm2
12164 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
12165 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm22[1],zmm12[1],zmm22[3],zmm12[3],zmm22[5],zmm12[5],zmm22[7],zmm12[7]
12166 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm14 {%k1}
12167 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm11, %zmm14, %zmm2
12168 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill
12169 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm2
12170 ; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm21, %zmm2
12171 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12172 ; AVX512DQ-BW-NEXT: vpermi2q %zmm12, %zmm22, %zmm21
12173 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12174 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm2
12175 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm21
12176 ; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm16, %zmm2
12177 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12178 ; AVX512DQ-BW-NEXT: vpermi2q %zmm12, %zmm22, %zmm16
12179 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12180 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm16
12181 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
12182 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm14
12183 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
12184 ; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm23, %zmm14
12185 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
12186 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
12187 ; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm23, %zmm11
12188 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm12
12189 ; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm23, %zmm12
12190 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm29
12191 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
12192 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm8
12193 ; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm23, %zmm8
12194 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
12195 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm10
12196 ; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm23, %zmm10
12197 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm5
12198 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm7
12199 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
12200 ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm23, %zmm7
12201 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm0
12202 ; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm23, %zmm0
12203 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12204 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
12205 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm0
12206 ; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm23, %zmm0
12207 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12208 ; AVX512DQ-BW-NEXT: vpermi2q %zmm16, %zmm22, %zmm23
12209 ; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm17, %zmm22
12210 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k1} = zmm21[0],zmm20[0],zmm21[2],zmm20[2],zmm21[4],zmm20[4],zmm21[6],zmm20[6]
12211 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm22 {%k1} = zmm21[1],zmm20[1],zmm21[3],zmm20[3],zmm21[5],zmm20[5],zmm21[7],zmm20[7]
12212 ; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm17, %zmm3
12213 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12214 ; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm17, %zmm2
12215 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12216 ; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm17, %zmm9
12217 ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm17, %zmm5
12218 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12219 ; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm17, %zmm15
12220 ; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm17, %zmm1
12221 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12222 ; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm17, %zmm6
12223 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12224 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
12225 ; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm17, %zmm3
12226 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm14[4,5,6,7]
12227 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12228 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm14
12229 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm12[4,5,6,7]
12230 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm30, %zmm4
12231 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm10[4,5,6,7]
12232 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm10
12233 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm25 {%k1}
12234 ; AVX512DQ-BW-NEXT: vmovdqa 64(%rdi), %xmm0
12235 ; AVX512DQ-BW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0
12236 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm1
12237 ; AVX512DQ-BW-NEXT: vinserti128 $1, 128(%rdi), %ymm1, %ymm1
12238 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
12239 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm25, %zmm2
12240 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
12241 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
12242 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1}
12243 ; AVX512DQ-BW-NEXT: vmovdqa 576(%rdi), %xmm5
12244 ; AVX512DQ-BW-NEXT: vinserti128 $1, 704(%rdi), %ymm5, %ymm5
12245 ; AVX512DQ-BW-NEXT: vmovdqa 512(%rdi), %xmm6
12246 ; AVX512DQ-BW-NEXT: vinserti128 $1, 640(%rdi), %ymm6, %ymm6
12247 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm6[0],ymm5[0],ymm6[2],ymm5[2]
12248 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm18, %zmm7, %zmm27
12249 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
12250 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
12251 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1}
12252 ; AVX512DQ-BW-NEXT: vmovdqa 1088(%rdi), %xmm13
12253 ; AVX512DQ-BW-NEXT: vinserti128 $1, 1216(%rdi), %ymm13, %ymm13
12254 ; AVX512DQ-BW-NEXT: vmovdqa64 1024(%rdi), %xmm18
12255 ; AVX512DQ-BW-NEXT: vinserti32x4 $1, 1152(%rdi), %ymm18, %ymm18
12256 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm19 = ymm18[0],ymm13[0],ymm18[2],ymm13[2]
12257 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm19, %zmm7, %zmm19
12258 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
12259 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
12260 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1}
12261 ; AVX512DQ-BW-NEXT: vmovdqa64 1600(%rdi), %xmm21
12262 ; AVX512DQ-BW-NEXT: vinserti32x4 $1, 1728(%rdi), %ymm21, %ymm21
12263 ; AVX512DQ-BW-NEXT: vmovdqa64 1536(%rdi), %xmm25
12264 ; AVX512DQ-BW-NEXT: vinserti32x4 $1, 1664(%rdi), %ymm25, %ymm25
12265 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm26 = ymm25[0],ymm21[0],ymm25[2],ymm21[2]
12266 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm26, %zmm7, %zmm12
12267 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
12268 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
12269 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1}
12270 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3]
12271 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm13, %zmm7, %zmm13
12272 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
12273 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
12274 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1}
12275 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3]
12276 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5
12277 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
12278 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
12279 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1}
12280 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
12281 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0
12282 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
12283 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
12284 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1}
12285 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm25[1],ymm21[1],ymm25[3],ymm21[3]
12286 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm1
12287 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
12288 ; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
12289 ; AVX512DQ-BW-NEXT: # ymm6 = ymm6[0,1,2,3],mem[4,5,6,7]
12290 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm6, %zmm23, %zmm6
12291 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
12292 ; AVX512DQ-BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
12293 ; AVX512DQ-BW-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7]
12294 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
12295 ; AVX512DQ-BW-NEXT: vinsertf64x4 $0, %ymm7, %zmm8, %zmm7
12296 ; AVX512DQ-BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm8 # 32-byte Folded Reload
12297 ; AVX512DQ-BW-NEXT: # ymm8 = mem[0,1,2,3],ymm9[4,5,6,7]
12298 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
12299 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8
12300 ; AVX512DQ-BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm9 # 32-byte Folded Reload
12301 ; AVX512DQ-BW-NEXT: # ymm9 = mem[0,1,2,3],ymm15[4,5,6,7]
12302 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm9, %zmm22, %zmm9
12303 ; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm11 # 32-byte Folded Reload
12304 ; AVX512DQ-BW-NEXT: # ymm11 = ymm3[0,1,2,3],mem[4,5,6,7]
12305 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
12306 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm11, %zmm3, %zmm11
12307 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, 192(%rsi)
12308 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, 128(%rsi)
12309 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, 64(%rsi)
12310 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%rsi)
12311 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 192(%rdx)
12312 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%rdx)
12313 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 64(%rdx)
12314 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, 128(%rdx)
12315 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12316 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 192(%rcx)
12317 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12318 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, (%rcx)
12319 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12320 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 64(%rcx)
12321 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12322 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 128(%rcx)
12323 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12324 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 192(%r8)
12325 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12326 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, (%r8)
12327 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12328 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 64(%r8)
12329 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12330 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 128(%r8)
12331 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12332 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 192(%r9)
12333 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12334 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, (%r9)
12335 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12336 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 64(%r9)
12337 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12338 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 128(%r9)
12339 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
12340 ; AVX512DQ-BW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
12341 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 192(%rax)
12342 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12343 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, (%rax)
12344 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12345 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 64(%rax)
12346 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12347 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 128(%rax)
12348 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
12349 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 192(%rax)
12350 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, (%rax)
12351 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 64(%rax)
12352 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, 128(%rax)
12353 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
12354 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, 128(%rax)
12355 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, 192(%rax)
12356 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, (%rax)
12357 ; AVX512DQ-BW-NEXT: vmovaps %zmm7, 64(%rax)
12358 ; AVX512DQ-BW-NEXT: addq $3208, %rsp # imm = 0xC88
12359 ; AVX512DQ-BW-NEXT: vzeroupper
12360 ; AVX512DQ-BW-NEXT: retq
12362 ; AVX512DQ-BW-FCP-LABEL: load_i64_stride8_vf32:
12363 ; AVX512DQ-BW-FCP: # %bb.0:
12364 ; AVX512DQ-BW-FCP-NEXT: subq $3208, %rsp # imm = 0xC88
12365 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm12
12366 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm16
12367 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm19
12368 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm18
12369 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm24
12370 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm27
12371 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm7
12372 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12373 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm11
12374 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12375 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm31
12376 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm3
12377 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm20
12378 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm8
12379 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12380 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm25
12381 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12382 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm23
12383 ; AVX512DQ-BW-FCP-NEXT: movb $-64, %al
12384 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1
12385 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10]
12386 ; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
12387 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm1
12388 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm1
12389 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm2
12390 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12391 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm2
12392 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
12393 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1216(%rdi), %ymm22
12394 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 1152(%rdi), %ymm10
12395 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm10[0],ymm22[0],ymm10[2],ymm22[2]
12396 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 1088(%rdi), %ymm9
12397 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 1024(%rdi), %ymm6
12398 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm6[0],ymm9[0],ymm6[2],ymm9[2]
12399 ; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm13[2,3],ymm4[2,3]
12400 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm1
12401 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12402 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm2
12403 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm2
12404 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm4
12405 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm4
12406 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1}
12407 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 704(%rdi), %ymm7
12408 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 640(%rdi), %ymm13
12409 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm13[0],ymm7[0],ymm13[2],ymm7[2]
12410 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %ymm17
12411 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %ymm21
12412 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm21[0],ymm17[0],ymm21[2],ymm17[2]
12413 ; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm14[2,3]
12414 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm4, %zmm1
12415 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12416 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm4
12417 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm4
12418 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm11
12419 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm11
12420 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm11 {%k1}
12421 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm4
12422 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm14
12423 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm14[0],ymm4[0],ymm14[2],ymm4[2]
12424 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %ymm25
12425 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %ymm28
12426 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm28[0],ymm25[0],ymm28[2],ymm25[2]
12427 ; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3]
12428 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm29
12429 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1
12430 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12431 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm1
12432 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm1
12433 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm26
12434 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm12, %zmm26, %zmm0
12435 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
12436 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 1728(%rdi), %ymm2
12437 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 1664(%rdi), %ymm8
12438 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm8[0],ymm2[0],ymm8[2],ymm2[2]
12439 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1600(%rdi), %ymm30
12440 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 1536(%rdi), %ymm1
12441 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm1[0],ymm30[0],ymm1[2],ymm30[2]
12442 ; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm15[2,3],ymm11[2,3]
12443 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm0
12444 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12445 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11]
12446 ; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
12447 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm11
12448 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm11
12449 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm18
12450 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm18 {%k1}
12451 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm10[1],ymm22[1],ymm10[3],ymm22[3]
12452 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm9[1],ymm6[3],ymm9[3]
12453 ; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm6[2,3],ymm5[2,3]
12454 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm18, %zmm5
12455 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12456 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm5
12457 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10
12458 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm5
12459 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
12460 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm6
12461 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
12462 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm6
12463 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1}
12464 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm13[1],ymm7[1],ymm13[3],ymm7[3]
12465 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm21[1],ymm17[1],ymm21[3],ymm17[3]
12466 ; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm5[2,3],ymm3[2,3]
12467 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3
12468 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12469 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm23, (%rsp) # 64-byte Spill
12470 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm3
12471 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
12472 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm3
12473 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
12474 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm5
12475 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm5
12476 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1}
12477 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3]
12478 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm7
12479 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm28[1],ymm25[1],ymm28[3],ymm25[3]
12480 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm6
12481 ; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3]
12482 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm13
12483 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3
12484 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12485 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm15
12486 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12487 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm3
12488 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12489 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm3
12490 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm9
12491 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12492 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm12, %zmm26, %zmm0
12493 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1}
12494 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm4
12495 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm2[1],ymm8[3],ymm2[3]
12496 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],ymm30[1],ymm1[3],ymm30[3]
12497 ; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3]
12498 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
12499 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12500 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12]
12501 ; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
12502 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm1
12503 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12504 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm1
12505 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm8
12506 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12507 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm2
12508 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm5
12509 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12510 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm2
12511 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12512 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
12513 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm2
12514 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm2
12515 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
12516 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm17[0],zmm19[0],zmm17[2],zmm19[2],zmm17[4],zmm19[4],zmm17[6],zmm19[6]
12517 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
12518 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm25
12519 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
12520 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12521 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm1
12522 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12523 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm1
12524 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm26
12525 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm14
12526 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm2
12527 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12528 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm2
12529 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
12530 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm2
12531 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm29
12532 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm2
12533 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm11[0],zmm22[0],zmm11[2],zmm22[2],zmm11[4],zmm22[4],zmm11[6],zmm22[6]
12534 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm10
12535 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
12536 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
12537 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12538 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm2
12539 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12540 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1
12541 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12542 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
12543 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm6
12544 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12545 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2
12546 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12547 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm2
12548 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
12549 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm23
12550 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm20[0],zmm21[2],zmm20[2],zmm21[4],zmm20[4],zmm21[6],zmm20[6]
12551 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm22
12552 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm3 {%k1}
12553 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
12554 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12555 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm18
12556 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm1
12557 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12558 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm1
12559 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm13
12560 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12561 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm6
12562 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm2
12563 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12564 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm2
12565 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
12566 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm16, %zmm15, %zmm0
12567 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm9[0],zmm12[0],zmm9[2],zmm12[2],zmm9[4],zmm12[4],zmm9[6],zmm12[6]
12568 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
12569 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0
12570 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12571 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [5,13,5,13,5,13,5,13]
12572 ; AVX512DQ-BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
12573 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm4
12574 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1
12575 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm2, %zmm1
12576 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm4[4,5,6,7]
12577 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [0,8,0,8,0,8,0,8]
12578 ; AVX512DQ-BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
12579 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm9
12580 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm0
12581 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm21, %zmm0
12582 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12583 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [1,9,1,9,1,9,1,9]
12584 ; AVX512DQ-BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
12585 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm0
12586 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm16, %zmm0
12587 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12588 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [6,14,6,14,6,14,6,14]
12589 ; AVX512DQ-BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
12590 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm0
12591 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm23, %zmm0
12592 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm17[1],zmm19[1],zmm17[3],zmm19[3],zmm17[5],zmm19[5],zmm17[7],zmm19[7]
12593 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,7,15,7,15,7,15]
12594 ; AVX512DQ-BW-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
12595 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm17, %zmm9
12596 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm27[0],zmm24[0],zmm27[2],zmm24[2],zmm27[4],zmm24[4],zmm27[6],zmm24[6]
12597 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12598 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm27[1],zmm24[1],zmm27[3],zmm24[3],zmm27[5],zmm24[5],zmm27[7],zmm24[7]
12599 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12600 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm0
12601 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm8
12602 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm21, %zmm27
12603 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12604 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm16, %zmm0
12605 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12606 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm2, %zmm8
12607 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1}
12608 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm0
12609 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12610 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
12611 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm7
12612 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm9
12613 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm2, %zmm7
12614 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm13
12615 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm2, %zmm14
12616 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm7[4,5,6,7]
12617 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm0
12618 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm4
12619 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm21, %zmm4
12620 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12621 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm4
12622 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm16, %zmm4
12623 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12624 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm30
12625 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm23, %zmm30
12626 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm11[1],zmm10[1],zmm11[3],zmm10[3],zmm11[5],zmm10[5],zmm11[7],zmm10[7]
12627 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm17, %zmm0
12628 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm4
12629 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k1} = zmm29[0],zmm31[0],zmm29[2],zmm31[2],zmm29[4],zmm31[4],zmm29[6],zmm31[6]
12630 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm29[1],zmm31[1],zmm29[3],zmm31[3],zmm29[5],zmm31[5],zmm29[7],zmm31[7]
12631 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12632 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm0
12633 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm14
12634 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm21, %zmm4
12635 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12636 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm16, %zmm0
12637 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12638 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm2, %zmm14
12639 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm11 {%k1}
12640 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm0
12641 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12642 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
12643 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
12644 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm2, %zmm8
12645 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
12646 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm11
12647 ; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm11 # 64-byte Folded Reload
12648 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm8[4,5,6,7]
12649 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm0
12650 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm25
12651 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm21, %zmm25
12652 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm5
12653 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm16, %zmm5
12654 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12655 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm27
12656 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm23, %zmm27
12657 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm22[1],zmm20[1],zmm22[3],zmm20[3],zmm22[5],zmm20[5],zmm22[7],zmm20[7]
12658 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm17, %zmm0
12659 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
12660 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm31 # 64-byte Reload
12661 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k1} = zmm31[0],zmm1[0],zmm31[2],zmm1[2],zmm31[4],zmm1[4],zmm31[6],zmm1[6]
12662 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm31[1],zmm1[1],zmm31[3],zmm1[3],zmm31[5],zmm1[5],zmm31[7],zmm1[7]
12663 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12664 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm0
12665 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm24
12666 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm31
12667 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm16, %zmm0
12668 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12669 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm24
12670 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm14 {%k1}
12671 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm14, %zmm0
12672 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12673 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
12674 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm11
12675 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm2, %zmm11
12676 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
12677 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm2, %zmm6
12678 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0,1,2,3],ymm11[4,5,6,7]
12679 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
12680 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
12681 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm20, %zmm5, %zmm2
12682 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
12683 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm22[1],zmm12[1],zmm22[3],zmm12[3],zmm22[5],zmm12[5],zmm22[7],zmm12[7]
12684 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm14 {%k1}
12685 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm14, %zmm2
12686 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill
12687 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm2
12688 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm21, %zmm2
12689 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12690 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm12, %zmm22, %zmm21
12691 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12692 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm2
12693 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm21
12694 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm16, %zmm2
12695 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12696 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm12, %zmm22, %zmm16
12697 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12698 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm16
12699 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
12700 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm14
12701 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
12702 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm23, %zmm14
12703 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
12704 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
12705 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm23, %zmm11
12706 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm12
12707 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm23, %zmm12
12708 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm29
12709 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
12710 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8
12711 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm23, %zmm8
12712 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
12713 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm10
12714 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm23, %zmm10
12715 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5
12716 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm7
12717 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
12718 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm23, %zmm7
12719 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm0
12720 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm23, %zmm0
12721 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12722 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
12723 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0
12724 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm23, %zmm0
12725 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12726 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm16, %zmm22, %zmm23
12727 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm17, %zmm22
12728 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k1} = zmm21[0],zmm20[0],zmm21[2],zmm20[2],zmm21[4],zmm20[4],zmm21[6],zmm20[6]
12729 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm22 {%k1} = zmm21[1],zmm20[1],zmm21[3],zmm20[3],zmm21[5],zmm20[5],zmm21[7],zmm20[7]
12730 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm17, %zmm3
12731 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12732 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm17, %zmm2
12733 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12734 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm17, %zmm9
12735 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm17, %zmm5
12736 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12737 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm17, %zmm15
12738 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm17, %zmm1
12739 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12740 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm17, %zmm6
12741 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12742 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
12743 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm17, %zmm3
12744 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm14[4,5,6,7]
12745 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12746 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm14
12747 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm12[4,5,6,7]
12748 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm30, %zmm4
12749 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm10[4,5,6,7]
12750 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm10
12751 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm25 {%k1}
12752 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm0
12753 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0
12754 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm1
12755 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 128(%rdi), %ymm1, %ymm1
12756 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
12757 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm25, %zmm2
12758 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
12759 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
12760 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1}
12761 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 576(%rdi), %xmm5
12762 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 704(%rdi), %ymm5, %ymm5
12763 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 512(%rdi), %xmm6
12764 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm6, %ymm6
12765 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm6[0],ymm5[0],ymm6[2],ymm5[2]
12766 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm18, %zmm7, %zmm27
12767 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
12768 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
12769 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1}
12770 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 1088(%rdi), %xmm13
12771 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 1216(%rdi), %ymm13, %ymm13
12772 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1024(%rdi), %xmm18
12773 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, 1152(%rdi), %ymm18, %ymm18
12774 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm19 = ymm18[0],ymm13[0],ymm18[2],ymm13[2]
12775 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm19, %zmm7, %zmm19
12776 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
12777 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
12778 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1}
12779 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1600(%rdi), %xmm21
12780 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, 1728(%rdi), %ymm21, %ymm21
12781 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1536(%rdi), %xmm25
12782 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, 1664(%rdi), %ymm25, %ymm25
12783 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm26 = ymm25[0],ymm21[0],ymm25[2],ymm21[2]
12784 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm26, %zmm7, %zmm12
12785 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
12786 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
12787 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1}
12788 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3]
12789 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm7, %zmm13
12790 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
12791 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
12792 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1}
12793 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3]
12794 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5
12795 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
12796 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
12797 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1}
12798 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
12799 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0
12800 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
12801 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
12802 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1}
12803 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm25[1],ymm21[1],ymm25[3],ymm21[3]
12804 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm1
12805 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
12806 ; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
12807 ; AVX512DQ-BW-FCP-NEXT: # ymm6 = ymm6[0,1,2,3],mem[4,5,6,7]
12808 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm23, %zmm6
12809 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
12810 ; AVX512DQ-BW-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
12811 ; AVX512DQ-BW-FCP-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7]
12812 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
12813 ; AVX512DQ-BW-FCP-NEXT: vinsertf64x4 $0, %ymm7, %zmm8, %zmm7
12814 ; AVX512DQ-BW-FCP-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm8 # 32-byte Folded Reload
12815 ; AVX512DQ-BW-FCP-NEXT: # ymm8 = mem[0,1,2,3],ymm9[4,5,6,7]
12816 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
12817 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8
12818 ; AVX512DQ-BW-FCP-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm9 # 32-byte Folded Reload
12819 ; AVX512DQ-BW-FCP-NEXT: # ymm9 = mem[0,1,2,3],ymm15[4,5,6,7]
12820 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm22, %zmm9
12821 ; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm11 # 32-byte Folded Reload
12822 ; AVX512DQ-BW-FCP-NEXT: # ymm11 = ymm3[0,1,2,3],mem[4,5,6,7]
12823 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
12824 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm3, %zmm11
12825 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, 192(%rsi)
12826 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, 128(%rsi)
12827 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, 64(%rsi)
12828 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%rsi)
12829 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 192(%rdx)
12830 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%rdx)
12831 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 64(%rdx)
12832 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, 128(%rdx)
12833 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12834 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 192(%rcx)
12835 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12836 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, (%rcx)
12837 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12838 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 64(%rcx)
12839 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12840 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 128(%rcx)
12841 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12842 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 192(%r8)
12843 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12844 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, (%r8)
12845 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12846 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 64(%r8)
12847 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12848 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 128(%r8)
12849 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12850 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 192(%r9)
12851 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12852 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, (%r9)
12853 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12854 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 64(%r9)
12855 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12856 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 128(%r9)
12857 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
12858 ; AVX512DQ-BW-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
12859 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 192(%rax)
12860 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12861 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, (%rax)
12862 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12863 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 64(%rax)
12864 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12865 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 128(%rax)
12866 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
12867 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 192(%rax)
12868 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, (%rax)
12869 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 64(%rax)
12870 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, 128(%rax)
12871 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
12872 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, 128(%rax)
12873 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, 192(%rax)
12874 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, (%rax)
12875 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm7, 64(%rax)
12876 ; AVX512DQ-BW-FCP-NEXT: addq $3208, %rsp # imm = 0xC88
12877 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
12878 ; AVX512DQ-BW-FCP-NEXT: retq
12879 %wide.vec = load <256 x i64>, ptr %in.vec, align 64
12880 %strided.vec0 = shufflevector <256 x i64> %wide.vec, <256 x i64> poison, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56, i32 64, i32 72, i32 80, i32 88, i32 96, i32 104, i32 112, i32 120, i32 128, i32 136, i32 144, i32 152, i32 160, i32 168, i32 176, i32 184, i32 192, i32 200, i32 208, i32 216, i32 224, i32 232, i32 240, i32 248>
12881 %strided.vec1 = shufflevector <256 x i64> %wide.vec, <256 x i64> poison, <32 x i32> <i32 1, i32 9, i32 17, i32 25, i32 33, i32 41, i32 49, i32 57, i32 65, i32 73, i32 81, i32 89, i32 97, i32 105, i32 113, i32 121, i32 129, i32 137, i32 145, i32 153, i32 161, i32 169, i32 177, i32 185, i32 193, i32 201, i32 209, i32 217, i32 225, i32 233, i32 241, i32 249>
12882 %strided.vec2 = shufflevector <256 x i64> %wide.vec, <256 x i64> poison, <32 x i32> <i32 2, i32 10, i32 18, i32 26, i32 34, i32 42, i32 50, i32 58, i32 66, i32 74, i32 82, i32 90, i32 98, i32 106, i32 114, i32 122, i32 130, i32 138, i32 146, i32 154, i32 162, i32 170, i32 178, i32 186, i32 194, i32 202, i32 210, i32 218, i32 226, i32 234, i32 242, i32 250>
12883 %strided.vec3 = shufflevector <256 x i64> %wide.vec, <256 x i64> poison, <32 x i32> <i32 3, i32 11, i32 19, i32 27, i32 35, i32 43, i32 51, i32 59, i32 67, i32 75, i32 83, i32 91, i32 99, i32 107, i32 115, i32 123, i32 131, i32 139, i32 147, i32 155, i32 163, i32 171, i32 179, i32 187, i32 195, i32 203, i32 211, i32 219, i32 227, i32 235, i32 243, i32 251>
12884 %strided.vec4 = shufflevector <256 x i64> %wide.vec, <256 x i64> poison, <32 x i32> <i32 4, i32 12, i32 20, i32 28, i32 36, i32 44, i32 52, i32 60, i32 68, i32 76, i32 84, i32 92, i32 100, i32 108, i32 116, i32 124, i32 132, i32 140, i32 148, i32 156, i32 164, i32 172, i32 180, i32 188, i32 196, i32 204, i32 212, i32 220, i32 228, i32 236, i32 244, i32 252>
12885 %strided.vec5 = shufflevector <256 x i64> %wide.vec, <256 x i64> poison, <32 x i32> <i32 5, i32 13, i32 21, i32 29, i32 37, i32 45, i32 53, i32 61, i32 69, i32 77, i32 85, i32 93, i32 101, i32 109, i32 117, i32 125, i32 133, i32 141, i32 149, i32 157, i32 165, i32 173, i32 181, i32 189, i32 197, i32 205, i32 213, i32 221, i32 229, i32 237, i32 245, i32 253>
12886 %strided.vec6 = shufflevector <256 x i64> %wide.vec, <256 x i64> poison, <32 x i32> <i32 6, i32 14, i32 22, i32 30, i32 38, i32 46, i32 54, i32 62, i32 70, i32 78, i32 86, i32 94, i32 102, i32 110, i32 118, i32 126, i32 134, i32 142, i32 150, i32 158, i32 166, i32 174, i32 182, i32 190, i32 198, i32 206, i32 214, i32 222, i32 230, i32 238, i32 246, i32 254>
12887 %strided.vec7 = shufflevector <256 x i64> %wide.vec, <256 x i64> poison, <32 x i32> <i32 7, i32 15, i32 23, i32 31, i32 39, i32 47, i32 55, i32 63, i32 71, i32 79, i32 87, i32 95, i32 103, i32 111, i32 119, i32 127, i32 135, i32 143, i32 151, i32 159, i32 167, i32 175, i32 183, i32 191, i32 199, i32 207, i32 215, i32 223, i32 231, i32 239, i32 247, i32 255>
12888 store <32 x i64> %strided.vec0, ptr %out.vec0, align 64
12889 store <32 x i64> %strided.vec1, ptr %out.vec1, align 64
12890 store <32 x i64> %strided.vec2, ptr %out.vec2, align 64
12891 store <32 x i64> %strided.vec3, ptr %out.vec3, align 64
12892 store <32 x i64> %strided.vec4, ptr %out.vec4, align 64
12893 store <32 x i64> %strided.vec5, ptr %out.vec5, align 64
12894 store <32 x i64> %strided.vec6, ptr %out.vec6, align 64
12895 store <32 x i64> %strided.vec7, ptr %out.vec7, align 64
12899 define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6, ptr %out.vec7) nounwind {
12900 ; SSE-LABEL: load_i64_stride8_vf64:
12902 ; SSE-NEXT: subq $3720, %rsp # imm = 0xE88
12903 ; SSE-NEXT: movaps 960(%rdi), %xmm0
12904 ; SSE-NEXT: movaps 832(%rdi), %xmm1
12905 ; SSE-NEXT: movaps 768(%rdi), %xmm8
12906 ; SSE-NEXT: movaps 704(%rdi), %xmm2
12907 ; SSE-NEXT: movaps 640(%rdi), %xmm9
12908 ; SSE-NEXT: movaps 576(%rdi), %xmm3
12909 ; SSE-NEXT: movaps 512(%rdi), %xmm10
12910 ; SSE-NEXT: movaps 448(%rdi), %xmm4
12911 ; SSE-NEXT: movaps 384(%rdi), %xmm11
12912 ; SSE-NEXT: movaps 320(%rdi), %xmm5
12913 ; SSE-NEXT: movaps 256(%rdi), %xmm12
12914 ; SSE-NEXT: movaps 192(%rdi), %xmm6
12915 ; SSE-NEXT: movaps 128(%rdi), %xmm13
12916 ; SSE-NEXT: movaps 64(%rdi), %xmm7
12917 ; SSE-NEXT: movaps (%rdi), %xmm14
12918 ; SSE-NEXT: movaps %xmm14, %xmm15
12919 ; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm7[0]
12920 ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12921 ; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm7[1]
12922 ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12923 ; SSE-NEXT: movaps %xmm13, %xmm7
12924 ; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm6[0]
12925 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12926 ; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm6[1]
12927 ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12928 ; SSE-NEXT: movaps %xmm12, %xmm6
12929 ; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm5[0]
12930 ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12931 ; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm5[1]
12932 ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12933 ; SSE-NEXT: movaps %xmm11, %xmm5
12934 ; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm4[0]
12935 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12936 ; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm4[1]
12937 ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12938 ; SSE-NEXT: movaps %xmm10, %xmm4
12939 ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm3[0]
12940 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12941 ; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm3[1]
12942 ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12943 ; SSE-NEXT: movaps %xmm9, %xmm3
12944 ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0]
12945 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12946 ; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm2[1]
12947 ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12948 ; SSE-NEXT: movaps %xmm8, %xmm2
12949 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
12950 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12951 ; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm1[1]
12952 ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12953 ; SSE-NEXT: movaps 896(%rdi), %xmm1
12954 ; SSE-NEXT: movaps %xmm1, %xmm2
12955 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
12956 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12957 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
12958 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12959 ; SSE-NEXT: movaps 1088(%rdi), %xmm0
12960 ; SSE-NEXT: movaps 1024(%rdi), %xmm1
12961 ; SSE-NEXT: movaps %xmm1, %xmm2
12962 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
12963 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12964 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
12965 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12966 ; SSE-NEXT: movaps 1216(%rdi), %xmm0
12967 ; SSE-NEXT: movaps 1152(%rdi), %xmm1
12968 ; SSE-NEXT: movaps %xmm1, %xmm2
12969 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
12970 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12971 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
12972 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12973 ; SSE-NEXT: movaps 1344(%rdi), %xmm0
12974 ; SSE-NEXT: movaps 1280(%rdi), %xmm1
12975 ; SSE-NEXT: movaps %xmm1, %xmm2
12976 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
12977 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12978 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
12979 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12980 ; SSE-NEXT: movaps 1472(%rdi), %xmm0
12981 ; SSE-NEXT: movaps 1408(%rdi), %xmm1
12982 ; SSE-NEXT: movaps %xmm1, %xmm2
12983 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
12984 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12985 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
12986 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12987 ; SSE-NEXT: movaps 1600(%rdi), %xmm0
12988 ; SSE-NEXT: movaps 1536(%rdi), %xmm1
12989 ; SSE-NEXT: movaps %xmm1, %xmm2
12990 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
12991 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12992 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
12993 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12994 ; SSE-NEXT: movaps 1728(%rdi), %xmm0
12995 ; SSE-NEXT: movaps 1664(%rdi), %xmm1
12996 ; SSE-NEXT: movaps %xmm1, %xmm2
12997 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
12998 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12999 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13000 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13001 ; SSE-NEXT: movaps 1856(%rdi), %xmm0
13002 ; SSE-NEXT: movaps 1792(%rdi), %xmm1
13003 ; SSE-NEXT: movaps %xmm1, %xmm2
13004 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13005 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13006 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13007 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13008 ; SSE-NEXT: movaps 1984(%rdi), %xmm0
13009 ; SSE-NEXT: movaps 1920(%rdi), %xmm1
13010 ; SSE-NEXT: movaps %xmm1, %xmm2
13011 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13012 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13013 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13014 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13015 ; SSE-NEXT: movaps 2112(%rdi), %xmm0
13016 ; SSE-NEXT: movaps 2048(%rdi), %xmm1
13017 ; SSE-NEXT: movaps %xmm1, %xmm2
13018 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13019 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13020 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13021 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13022 ; SSE-NEXT: movaps 2240(%rdi), %xmm0
13023 ; SSE-NEXT: movaps 2176(%rdi), %xmm1
13024 ; SSE-NEXT: movaps %xmm1, %xmm2
13025 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13026 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13027 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13028 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13029 ; SSE-NEXT: movaps 2368(%rdi), %xmm0
13030 ; SSE-NEXT: movaps 2304(%rdi), %xmm1
13031 ; SSE-NEXT: movaps %xmm1, %xmm2
13032 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13033 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13034 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13035 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13036 ; SSE-NEXT: movaps 2496(%rdi), %xmm0
13037 ; SSE-NEXT: movaps 2432(%rdi), %xmm1
13038 ; SSE-NEXT: movaps %xmm1, %xmm2
13039 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13040 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13041 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13042 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13043 ; SSE-NEXT: movaps 2624(%rdi), %xmm0
13044 ; SSE-NEXT: movaps 2560(%rdi), %xmm1
13045 ; SSE-NEXT: movaps %xmm1, %xmm2
13046 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13047 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13048 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13049 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13050 ; SSE-NEXT: movaps 2752(%rdi), %xmm0
13051 ; SSE-NEXT: movaps 2688(%rdi), %xmm1
13052 ; SSE-NEXT: movaps %xmm1, %xmm2
13053 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13054 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13055 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13056 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13057 ; SSE-NEXT: movaps 2880(%rdi), %xmm0
13058 ; SSE-NEXT: movaps 2816(%rdi), %xmm1
13059 ; SSE-NEXT: movaps %xmm1, %xmm2
13060 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13061 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13062 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13063 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13064 ; SSE-NEXT: movaps 3008(%rdi), %xmm0
13065 ; SSE-NEXT: movaps 2944(%rdi), %xmm1
13066 ; SSE-NEXT: movaps %xmm1, %xmm2
13067 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13068 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13069 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13070 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13071 ; SSE-NEXT: movaps 3136(%rdi), %xmm0
13072 ; SSE-NEXT: movaps 3072(%rdi), %xmm1
13073 ; SSE-NEXT: movaps %xmm1, %xmm2
13074 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13075 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13076 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13077 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13078 ; SSE-NEXT: movaps 3264(%rdi), %xmm0
13079 ; SSE-NEXT: movaps 3200(%rdi), %xmm1
13080 ; SSE-NEXT: movaps %xmm1, %xmm2
13081 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13082 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13083 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13084 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13085 ; SSE-NEXT: movaps 3392(%rdi), %xmm0
13086 ; SSE-NEXT: movaps 3328(%rdi), %xmm1
13087 ; SSE-NEXT: movaps %xmm1, %xmm2
13088 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13089 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13090 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13091 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13092 ; SSE-NEXT: movaps 3520(%rdi), %xmm0
13093 ; SSE-NEXT: movaps 3456(%rdi), %xmm1
13094 ; SSE-NEXT: movaps %xmm1, %xmm2
13095 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13096 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13097 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13098 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13099 ; SSE-NEXT: movaps 3648(%rdi), %xmm0
13100 ; SSE-NEXT: movaps 3584(%rdi), %xmm1
13101 ; SSE-NEXT: movaps %xmm1, %xmm2
13102 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13103 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13104 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13105 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13106 ; SSE-NEXT: movaps 3776(%rdi), %xmm0
13107 ; SSE-NEXT: movaps 3712(%rdi), %xmm1
13108 ; SSE-NEXT: movaps %xmm1, %xmm2
13109 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13110 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13111 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13112 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13113 ; SSE-NEXT: movaps 3904(%rdi), %xmm0
13114 ; SSE-NEXT: movaps 3840(%rdi), %xmm1
13115 ; SSE-NEXT: movaps %xmm1, %xmm2
13116 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13117 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13118 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13119 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13120 ; SSE-NEXT: movaps 4032(%rdi), %xmm0
13121 ; SSE-NEXT: movaps 3968(%rdi), %xmm1
13122 ; SSE-NEXT: movaps %xmm1, %xmm7
13123 ; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm0[0]
13124 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13125 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13126 ; SSE-NEXT: movaps 80(%rdi), %xmm0
13127 ; SSE-NEXT: movaps 16(%rdi), %xmm1
13128 ; SSE-NEXT: movaps %xmm1, %xmm2
13129 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13130 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13131 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13132 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13133 ; SSE-NEXT: movaps 208(%rdi), %xmm0
13134 ; SSE-NEXT: movaps 144(%rdi), %xmm1
13135 ; SSE-NEXT: movaps %xmm1, %xmm2
13136 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13137 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13138 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13139 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13140 ; SSE-NEXT: movaps 336(%rdi), %xmm0
13141 ; SSE-NEXT: movaps 272(%rdi), %xmm1
13142 ; SSE-NEXT: movaps %xmm1, %xmm2
13143 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13144 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13145 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13146 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13147 ; SSE-NEXT: movaps 464(%rdi), %xmm0
13148 ; SSE-NEXT: movaps 400(%rdi), %xmm1
13149 ; SSE-NEXT: movaps %xmm1, %xmm2
13150 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13151 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13152 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13153 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13154 ; SSE-NEXT: movaps 592(%rdi), %xmm0
13155 ; SSE-NEXT: movaps 528(%rdi), %xmm1
13156 ; SSE-NEXT: movaps %xmm1, %xmm2
13157 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13158 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13159 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13160 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13161 ; SSE-NEXT: movaps 720(%rdi), %xmm0
13162 ; SSE-NEXT: movaps 656(%rdi), %xmm1
13163 ; SSE-NEXT: movaps %xmm1, %xmm2
13164 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13165 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13166 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13167 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13168 ; SSE-NEXT: movaps 848(%rdi), %xmm0
13169 ; SSE-NEXT: movaps 784(%rdi), %xmm1
13170 ; SSE-NEXT: movaps %xmm1, %xmm2
13171 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13172 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13173 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13174 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13175 ; SSE-NEXT: movaps 976(%rdi), %xmm0
13176 ; SSE-NEXT: movaps 912(%rdi), %xmm1
13177 ; SSE-NEXT: movaps %xmm1, %xmm2
13178 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13179 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13180 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13181 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13182 ; SSE-NEXT: movaps 1104(%rdi), %xmm0
13183 ; SSE-NEXT: movaps 1040(%rdi), %xmm1
13184 ; SSE-NEXT: movaps %xmm1, %xmm2
13185 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13186 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13187 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13188 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13189 ; SSE-NEXT: movaps 1232(%rdi), %xmm0
13190 ; SSE-NEXT: movaps 1168(%rdi), %xmm1
13191 ; SSE-NEXT: movaps %xmm1, %xmm2
13192 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13193 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13194 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13195 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13196 ; SSE-NEXT: movaps 1360(%rdi), %xmm0
13197 ; SSE-NEXT: movaps 1296(%rdi), %xmm1
13198 ; SSE-NEXT: movaps %xmm1, %xmm2
13199 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13200 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13201 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13202 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13203 ; SSE-NEXT: movaps 1488(%rdi), %xmm0
13204 ; SSE-NEXT: movaps 1424(%rdi), %xmm1
13205 ; SSE-NEXT: movaps %xmm1, %xmm2
13206 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13207 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13208 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13209 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13210 ; SSE-NEXT: movaps 1616(%rdi), %xmm0
13211 ; SSE-NEXT: movaps 1552(%rdi), %xmm1
13212 ; SSE-NEXT: movaps %xmm1, %xmm2
13213 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13214 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13215 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13216 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13217 ; SSE-NEXT: movaps 1744(%rdi), %xmm0
13218 ; SSE-NEXT: movaps 1680(%rdi), %xmm1
13219 ; SSE-NEXT: movaps %xmm1, %xmm2
13220 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13221 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13222 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13223 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13224 ; SSE-NEXT: movaps 1872(%rdi), %xmm0
13225 ; SSE-NEXT: movaps 1808(%rdi), %xmm1
13226 ; SSE-NEXT: movaps %xmm1, %xmm2
13227 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13228 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13229 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13230 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13231 ; SSE-NEXT: movaps 2000(%rdi), %xmm0
13232 ; SSE-NEXT: movaps 1936(%rdi), %xmm1
13233 ; SSE-NEXT: movaps %xmm1, %xmm2
13234 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13235 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13236 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13237 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13238 ; SSE-NEXT: movaps 2128(%rdi), %xmm0
13239 ; SSE-NEXT: movaps 2064(%rdi), %xmm1
13240 ; SSE-NEXT: movaps %xmm1, %xmm2
13241 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13242 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13243 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13244 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13245 ; SSE-NEXT: movaps 2256(%rdi), %xmm0
13246 ; SSE-NEXT: movaps 2192(%rdi), %xmm1
13247 ; SSE-NEXT: movaps %xmm1, %xmm2
13248 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13249 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13250 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13251 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13252 ; SSE-NEXT: movaps 2384(%rdi), %xmm0
13253 ; SSE-NEXT: movaps 2320(%rdi), %xmm1
13254 ; SSE-NEXT: movaps %xmm1, %xmm2
13255 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13256 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13257 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13258 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13259 ; SSE-NEXT: movaps 2512(%rdi), %xmm0
13260 ; SSE-NEXT: movaps 2448(%rdi), %xmm1
13261 ; SSE-NEXT: movaps %xmm1, %xmm2
13262 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13263 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13264 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13265 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13266 ; SSE-NEXT: movaps 2640(%rdi), %xmm0
13267 ; SSE-NEXT: movaps 2576(%rdi), %xmm1
13268 ; SSE-NEXT: movaps %xmm1, %xmm2
13269 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13270 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13271 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13272 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13273 ; SSE-NEXT: movaps 2768(%rdi), %xmm0
13274 ; SSE-NEXT: movaps 2704(%rdi), %xmm1
13275 ; SSE-NEXT: movaps %xmm1, %xmm2
13276 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13277 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13278 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13279 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13280 ; SSE-NEXT: movaps 2896(%rdi), %xmm0
13281 ; SSE-NEXT: movaps 2832(%rdi), %xmm1
13282 ; SSE-NEXT: movaps %xmm1, %xmm2
13283 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13284 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13285 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13286 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13287 ; SSE-NEXT: movaps 3024(%rdi), %xmm0
13288 ; SSE-NEXT: movaps 2960(%rdi), %xmm1
13289 ; SSE-NEXT: movaps %xmm1, %xmm2
13290 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13291 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13292 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13293 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13294 ; SSE-NEXT: movaps 3152(%rdi), %xmm0
13295 ; SSE-NEXT: movaps 3088(%rdi), %xmm1
13296 ; SSE-NEXT: movaps %xmm1, %xmm2
13297 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13298 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13299 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13300 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13301 ; SSE-NEXT: movaps 3280(%rdi), %xmm0
13302 ; SSE-NEXT: movaps 3216(%rdi), %xmm1
13303 ; SSE-NEXT: movaps %xmm1, %xmm2
13304 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13305 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13306 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13307 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13308 ; SSE-NEXT: movaps 3408(%rdi), %xmm0
13309 ; SSE-NEXT: movaps 3344(%rdi), %xmm1
13310 ; SSE-NEXT: movaps %xmm1, %xmm2
13311 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13312 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13313 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13314 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13315 ; SSE-NEXT: movaps 3536(%rdi), %xmm0
13316 ; SSE-NEXT: movaps 3472(%rdi), %xmm1
13317 ; SSE-NEXT: movaps %xmm1, %xmm2
13318 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13319 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13320 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13321 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13322 ; SSE-NEXT: movaps 3664(%rdi), %xmm0
13323 ; SSE-NEXT: movaps 3600(%rdi), %xmm1
13324 ; SSE-NEXT: movaps %xmm1, %xmm2
13325 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13326 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13327 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13328 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13329 ; SSE-NEXT: movaps 3792(%rdi), %xmm0
13330 ; SSE-NEXT: movaps 3728(%rdi), %xmm1
13331 ; SSE-NEXT: movaps %xmm1, %xmm2
13332 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13333 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13334 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13335 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13336 ; SSE-NEXT: movaps 3920(%rdi), %xmm0
13337 ; SSE-NEXT: movaps 3856(%rdi), %xmm1
13338 ; SSE-NEXT: movaps %xmm1, %xmm2
13339 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13340 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13341 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13342 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13343 ; SSE-NEXT: movaps 4048(%rdi), %xmm0
13344 ; SSE-NEXT: movaps 3984(%rdi), %xmm1
13345 ; SSE-NEXT: movaps %xmm1, %xmm2
13346 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13347 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13348 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13349 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13350 ; SSE-NEXT: movaps 96(%rdi), %xmm0
13351 ; SSE-NEXT: movaps 32(%rdi), %xmm1
13352 ; SSE-NEXT: movaps %xmm1, %xmm2
13353 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13354 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13355 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13356 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13357 ; SSE-NEXT: movaps 224(%rdi), %xmm0
13358 ; SSE-NEXT: movaps 160(%rdi), %xmm1
13359 ; SSE-NEXT: movaps %xmm1, %xmm2
13360 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13361 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13362 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13363 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13364 ; SSE-NEXT: movaps 352(%rdi), %xmm0
13365 ; SSE-NEXT: movaps 288(%rdi), %xmm1
13366 ; SSE-NEXT: movaps %xmm1, %xmm2
13367 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13368 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13369 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13370 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13371 ; SSE-NEXT: movaps 480(%rdi), %xmm0
13372 ; SSE-NEXT: movaps 416(%rdi), %xmm1
13373 ; SSE-NEXT: movaps %xmm1, %xmm2
13374 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13375 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13376 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13377 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13378 ; SSE-NEXT: movaps 608(%rdi), %xmm0
13379 ; SSE-NEXT: movaps 544(%rdi), %xmm1
13380 ; SSE-NEXT: movaps %xmm1, %xmm2
13381 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13382 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13383 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13384 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13385 ; SSE-NEXT: movaps 736(%rdi), %xmm0
13386 ; SSE-NEXT: movaps 672(%rdi), %xmm1
13387 ; SSE-NEXT: movaps %xmm1, %xmm2
13388 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13389 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13390 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13391 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13392 ; SSE-NEXT: movaps 864(%rdi), %xmm0
13393 ; SSE-NEXT: movaps 800(%rdi), %xmm1
13394 ; SSE-NEXT: movaps %xmm1, %xmm2
13395 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13396 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13397 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13398 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13399 ; SSE-NEXT: movaps 992(%rdi), %xmm0
13400 ; SSE-NEXT: movaps 928(%rdi), %xmm1
13401 ; SSE-NEXT: movaps %xmm1, %xmm2
13402 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13403 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13404 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13405 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13406 ; SSE-NEXT: movaps 1120(%rdi), %xmm0
13407 ; SSE-NEXT: movaps 1056(%rdi), %xmm1
13408 ; SSE-NEXT: movaps %xmm1, %xmm2
13409 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13410 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13411 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13412 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13413 ; SSE-NEXT: movaps 1248(%rdi), %xmm0
13414 ; SSE-NEXT: movaps 1184(%rdi), %xmm1
13415 ; SSE-NEXT: movaps %xmm1, %xmm2
13416 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13417 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13418 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13419 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13420 ; SSE-NEXT: movaps 1376(%rdi), %xmm0
13421 ; SSE-NEXT: movaps 1312(%rdi), %xmm1
13422 ; SSE-NEXT: movaps %xmm1, %xmm2
13423 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13424 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13425 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13426 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13427 ; SSE-NEXT: movaps 1504(%rdi), %xmm0
13428 ; SSE-NEXT: movaps 1440(%rdi), %xmm1
13429 ; SSE-NEXT: movaps %xmm1, %xmm2
13430 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13431 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13432 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13433 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13434 ; SSE-NEXT: movaps 1632(%rdi), %xmm0
13435 ; SSE-NEXT: movaps 1568(%rdi), %xmm1
13436 ; SSE-NEXT: movaps %xmm1, %xmm2
13437 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13438 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13439 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13440 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13441 ; SSE-NEXT: movaps 1760(%rdi), %xmm0
13442 ; SSE-NEXT: movaps 1696(%rdi), %xmm1
13443 ; SSE-NEXT: movaps %xmm1, %xmm2
13444 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13445 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13446 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13447 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13448 ; SSE-NEXT: movaps 1888(%rdi), %xmm0
13449 ; SSE-NEXT: movaps 1824(%rdi), %xmm1
13450 ; SSE-NEXT: movaps %xmm1, %xmm2
13451 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13452 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13453 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13454 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13455 ; SSE-NEXT: movaps 2016(%rdi), %xmm0
13456 ; SSE-NEXT: movaps 1952(%rdi), %xmm1
13457 ; SSE-NEXT: movaps %xmm1, %xmm2
13458 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13459 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13460 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13461 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13462 ; SSE-NEXT: movaps 2144(%rdi), %xmm0
13463 ; SSE-NEXT: movaps 2080(%rdi), %xmm1
13464 ; SSE-NEXT: movaps %xmm1, %xmm2
13465 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13466 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13467 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13468 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13469 ; SSE-NEXT: movaps 2272(%rdi), %xmm0
13470 ; SSE-NEXT: movaps 2208(%rdi), %xmm1
13471 ; SSE-NEXT: movaps %xmm1, %xmm2
13472 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13473 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13474 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13475 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13476 ; SSE-NEXT: movaps 2400(%rdi), %xmm0
13477 ; SSE-NEXT: movaps 2336(%rdi), %xmm1
13478 ; SSE-NEXT: movaps %xmm1, %xmm2
13479 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13480 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13481 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13482 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13483 ; SSE-NEXT: movaps 2528(%rdi), %xmm0
13484 ; SSE-NEXT: movaps 2464(%rdi), %xmm1
13485 ; SSE-NEXT: movaps %xmm1, %xmm2
13486 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13487 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13488 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13489 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13490 ; SSE-NEXT: movaps 2656(%rdi), %xmm0
13491 ; SSE-NEXT: movaps 2592(%rdi), %xmm1
13492 ; SSE-NEXT: movaps %xmm1, %xmm2
13493 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13494 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13495 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13496 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13497 ; SSE-NEXT: movaps 2784(%rdi), %xmm0
13498 ; SSE-NEXT: movaps 2720(%rdi), %xmm1
13499 ; SSE-NEXT: movaps %xmm1, %xmm2
13500 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13501 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13502 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13503 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13504 ; SSE-NEXT: movaps 2912(%rdi), %xmm0
13505 ; SSE-NEXT: movaps 2848(%rdi), %xmm1
13506 ; SSE-NEXT: movaps %xmm1, %xmm2
13507 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13508 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13509 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13510 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13511 ; SSE-NEXT: movaps 3040(%rdi), %xmm0
13512 ; SSE-NEXT: movaps 2976(%rdi), %xmm1
13513 ; SSE-NEXT: movaps %xmm1, %xmm2
13514 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13515 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13516 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13517 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13518 ; SSE-NEXT: movaps 3168(%rdi), %xmm0
13519 ; SSE-NEXT: movaps 3104(%rdi), %xmm1
13520 ; SSE-NEXT: movaps %xmm1, %xmm2
13521 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13522 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13523 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13524 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13525 ; SSE-NEXT: movaps 3296(%rdi), %xmm0
13526 ; SSE-NEXT: movaps 3232(%rdi), %xmm1
13527 ; SSE-NEXT: movaps %xmm1, %xmm2
13528 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13529 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13530 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13531 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13532 ; SSE-NEXT: movaps 3424(%rdi), %xmm0
13533 ; SSE-NEXT: movaps 3360(%rdi), %xmm1
13534 ; SSE-NEXT: movaps %xmm1, %xmm2
13535 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13536 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13537 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13538 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13539 ; SSE-NEXT: movaps 3552(%rdi), %xmm0
13540 ; SSE-NEXT: movaps 3488(%rdi), %xmm1
13541 ; SSE-NEXT: movaps %xmm1, %xmm2
13542 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13543 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13544 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13545 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13546 ; SSE-NEXT: movaps 3680(%rdi), %xmm0
13547 ; SSE-NEXT: movaps 3616(%rdi), %xmm1
13548 ; SSE-NEXT: movaps %xmm1, %xmm2
13549 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13550 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13551 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13552 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13553 ; SSE-NEXT: movaps 3808(%rdi), %xmm0
13554 ; SSE-NEXT: movaps 3744(%rdi), %xmm1
13555 ; SSE-NEXT: movaps %xmm1, %xmm2
13556 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13557 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13558 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13559 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13560 ; SSE-NEXT: movaps 3936(%rdi), %xmm0
13561 ; SSE-NEXT: movaps 3872(%rdi), %xmm1
13562 ; SSE-NEXT: movaps %xmm1, %xmm2
13563 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13564 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13565 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13566 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13567 ; SSE-NEXT: movaps 4064(%rdi), %xmm0
13568 ; SSE-NEXT: movaps 4000(%rdi), %xmm1
13569 ; SSE-NEXT: movaps %xmm1, %xmm2
13570 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13571 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13572 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13573 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13574 ; SSE-NEXT: movaps 112(%rdi), %xmm0
13575 ; SSE-NEXT: movaps 48(%rdi), %xmm1
13576 ; SSE-NEXT: movaps %xmm1, %xmm2
13577 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13578 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13579 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13580 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13581 ; SSE-NEXT: movaps 240(%rdi), %xmm0
13582 ; SSE-NEXT: movaps 176(%rdi), %xmm1
13583 ; SSE-NEXT: movaps %xmm1, %xmm2
13584 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13585 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13586 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13587 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13588 ; SSE-NEXT: movaps 368(%rdi), %xmm0
13589 ; SSE-NEXT: movaps 304(%rdi), %xmm1
13590 ; SSE-NEXT: movaps %xmm1, %xmm2
13591 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13592 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13593 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13594 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13595 ; SSE-NEXT: movaps 496(%rdi), %xmm0
13596 ; SSE-NEXT: movaps 432(%rdi), %xmm1
13597 ; SSE-NEXT: movaps %xmm1, %xmm2
13598 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13599 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13600 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13601 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13602 ; SSE-NEXT: movaps 624(%rdi), %xmm0
13603 ; SSE-NEXT: movaps 560(%rdi), %xmm1
13604 ; SSE-NEXT: movaps %xmm1, %xmm2
13605 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13606 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13607 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13608 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13609 ; SSE-NEXT: movaps 752(%rdi), %xmm0
13610 ; SSE-NEXT: movaps 688(%rdi), %xmm1
13611 ; SSE-NEXT: movaps %xmm1, %xmm2
13612 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13613 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13614 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13615 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13616 ; SSE-NEXT: movaps 880(%rdi), %xmm0
13617 ; SSE-NEXT: movaps 816(%rdi), %xmm1
13618 ; SSE-NEXT: movaps %xmm1, %xmm2
13619 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13620 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13621 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13622 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13623 ; SSE-NEXT: movaps 1008(%rdi), %xmm0
13624 ; SSE-NEXT: movaps 944(%rdi), %xmm1
13625 ; SSE-NEXT: movaps %xmm1, %xmm2
13626 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13627 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13628 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13629 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13630 ; SSE-NEXT: movaps 1136(%rdi), %xmm0
13631 ; SSE-NEXT: movaps 1072(%rdi), %xmm1
13632 ; SSE-NEXT: movaps %xmm1, %xmm2
13633 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13634 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13635 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13636 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13637 ; SSE-NEXT: movaps 1264(%rdi), %xmm0
13638 ; SSE-NEXT: movaps 1200(%rdi), %xmm1
13639 ; SSE-NEXT: movaps %xmm1, %xmm2
13640 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13641 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13642 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13643 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13644 ; SSE-NEXT: movaps 1392(%rdi), %xmm0
13645 ; SSE-NEXT: movaps 1328(%rdi), %xmm1
13646 ; SSE-NEXT: movaps %xmm1, %xmm2
13647 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13648 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13649 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13650 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13651 ; SSE-NEXT: movaps 1520(%rdi), %xmm0
13652 ; SSE-NEXT: movaps 1456(%rdi), %xmm1
13653 ; SSE-NEXT: movaps %xmm1, %xmm2
13654 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13655 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13656 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13657 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13658 ; SSE-NEXT: movaps 1648(%rdi), %xmm0
13659 ; SSE-NEXT: movaps 1584(%rdi), %xmm1
13660 ; SSE-NEXT: movaps %xmm1, %xmm2
13661 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13662 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13663 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13664 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13665 ; SSE-NEXT: movaps 1776(%rdi), %xmm0
13666 ; SSE-NEXT: movaps 1712(%rdi), %xmm1
13667 ; SSE-NEXT: movaps %xmm1, %xmm2
13668 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13669 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13670 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13671 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13672 ; SSE-NEXT: movaps 1904(%rdi), %xmm0
13673 ; SSE-NEXT: movaps 1840(%rdi), %xmm1
13674 ; SSE-NEXT: movaps %xmm1, %xmm2
13675 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13676 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13677 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13678 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13679 ; SSE-NEXT: movaps 2032(%rdi), %xmm0
13680 ; SSE-NEXT: movaps 1968(%rdi), %xmm1
13681 ; SSE-NEXT: movaps %xmm1, %xmm2
13682 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13683 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13684 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13685 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13686 ; SSE-NEXT: movaps 2160(%rdi), %xmm0
13687 ; SSE-NEXT: movaps 2096(%rdi), %xmm1
13688 ; SSE-NEXT: movaps %xmm1, %xmm2
13689 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13690 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13691 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13692 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13693 ; SSE-NEXT: movaps 2288(%rdi), %xmm0
13694 ; SSE-NEXT: movaps 2224(%rdi), %xmm1
13695 ; SSE-NEXT: movaps %xmm1, %xmm2
13696 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13697 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13698 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13699 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13700 ; SSE-NEXT: movaps 2416(%rdi), %xmm0
13701 ; SSE-NEXT: movaps 2352(%rdi), %xmm1
13702 ; SSE-NEXT: movaps %xmm1, %xmm2
13703 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13704 ; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill
13705 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13706 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13707 ; SSE-NEXT: movaps 2544(%rdi), %xmm0
13708 ; SSE-NEXT: movaps 2480(%rdi), %xmm1
13709 ; SSE-NEXT: movaps %xmm1, %xmm2
13710 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13711 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13712 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13713 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13714 ; SSE-NEXT: movaps 2672(%rdi), %xmm0
13715 ; SSE-NEXT: movaps 2608(%rdi), %xmm1
13716 ; SSE-NEXT: movaps %xmm1, %xmm2
13717 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13718 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13719 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13720 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13721 ; SSE-NEXT: movaps 2800(%rdi), %xmm0
13722 ; SSE-NEXT: movaps 2736(%rdi), %xmm1
13723 ; SSE-NEXT: movaps %xmm1, %xmm2
13724 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
13725 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13726 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
13727 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13728 ; SSE-NEXT: movaps 2928(%rdi), %xmm0
13729 ; SSE-NEXT: movaps 2864(%rdi), %xmm14
13730 ; SSE-NEXT: movaps %xmm14, %xmm1
13731 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
13732 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13733 ; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1]
13734 ; SSE-NEXT: movaps 3056(%rdi), %xmm0
13735 ; SSE-NEXT: movaps 2992(%rdi), %xmm11
13736 ; SSE-NEXT: movaps %xmm11, %xmm1
13737 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
13738 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13739 ; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1]
13740 ; SSE-NEXT: movaps 3184(%rdi), %xmm0
13741 ; SSE-NEXT: movaps 3120(%rdi), %xmm15
13742 ; SSE-NEXT: movaps %xmm15, %xmm1
13743 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
13744 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13745 ; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1]
13746 ; SSE-NEXT: movaps 3312(%rdi), %xmm0
13747 ; SSE-NEXT: movaps 3248(%rdi), %xmm9
13748 ; SSE-NEXT: movaps %xmm9, %xmm1
13749 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
13750 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13751 ; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1]
13752 ; SSE-NEXT: movaps 3440(%rdi), %xmm0
13753 ; SSE-NEXT: movaps 3376(%rdi), %xmm12
13754 ; SSE-NEXT: movaps %xmm12, %xmm1
13755 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
13756 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13757 ; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1]
13758 ; SSE-NEXT: movaps 3568(%rdi), %xmm0
13759 ; SSE-NEXT: movaps 3504(%rdi), %xmm8
13760 ; SSE-NEXT: movaps %xmm8, %xmm13
13761 ; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm0[0]
13762 ; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1]
13763 ; SSE-NEXT: movaps 4016(%rdi), %xmm4
13764 ; SSE-NEXT: movaps 3952(%rdi), %xmm3
13765 ; SSE-NEXT: movaps 3696(%rdi), %xmm0
13766 ; SSE-NEXT: movaps 3632(%rdi), %xmm2
13767 ; SSE-NEXT: movaps %xmm2, %xmm10
13768 ; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0]
13769 ; SSE-NEXT: movaps 4080(%rdi), %xmm1
13770 ; SSE-NEXT: movaps 3888(%rdi), %xmm5
13771 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
13772 ; SSE-NEXT: movaps 3824(%rdi), %xmm6
13773 ; SSE-NEXT: movaps 3760(%rdi), %xmm0
13774 ; SSE-NEXT: movaps %xmm7, 496(%rsi)
13775 ; SSE-NEXT: movaps %xmm0, %xmm7
13776 ; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm6[0]
13777 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm6[1]
13778 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
13779 ; SSE-NEXT: movaps %xmm6, 480(%rsi)
13780 ; SSE-NEXT: movaps %xmm5, %xmm6
13781 ; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm3[0]
13782 ; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm3[1]
13783 ; SSE-NEXT: movaps %xmm4, %xmm3
13784 ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0]
13785 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1]
13786 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13787 ; SSE-NEXT: movaps %xmm1, 464(%rsi)
13788 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13789 ; SSE-NEXT: movaps %xmm1, 448(%rsi)
13790 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13791 ; SSE-NEXT: movaps %xmm1, 432(%rsi)
13792 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13793 ; SSE-NEXT: movaps %xmm1, 416(%rsi)
13794 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13795 ; SSE-NEXT: movaps %xmm1, 400(%rsi)
13796 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13797 ; SSE-NEXT: movaps %xmm1, 384(%rsi)
13798 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13799 ; SSE-NEXT: movaps %xmm1, 368(%rsi)
13800 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13801 ; SSE-NEXT: movaps %xmm1, 352(%rsi)
13802 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13803 ; SSE-NEXT: movaps %xmm1, 336(%rsi)
13804 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13805 ; SSE-NEXT: movaps %xmm1, 320(%rsi)
13806 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13807 ; SSE-NEXT: movaps %xmm1, 304(%rsi)
13808 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13809 ; SSE-NEXT: movaps %xmm1, 288(%rsi)
13810 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13811 ; SSE-NEXT: movaps %xmm1, 272(%rsi)
13812 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13813 ; SSE-NEXT: movaps %xmm1, 256(%rsi)
13814 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13815 ; SSE-NEXT: movaps %xmm1, 240(%rsi)
13816 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13817 ; SSE-NEXT: movaps %xmm1, 224(%rsi)
13818 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13819 ; SSE-NEXT: movaps %xmm1, 208(%rsi)
13820 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13821 ; SSE-NEXT: movaps %xmm1, 192(%rsi)
13822 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13823 ; SSE-NEXT: movaps %xmm1, 176(%rsi)
13824 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13825 ; SSE-NEXT: movaps %xmm1, 160(%rsi)
13826 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13827 ; SSE-NEXT: movaps %xmm1, 144(%rsi)
13828 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13829 ; SSE-NEXT: movaps %xmm1, 128(%rsi)
13830 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13831 ; SSE-NEXT: movaps %xmm1, 112(%rsi)
13832 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13833 ; SSE-NEXT: movaps %xmm1, 96(%rsi)
13834 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13835 ; SSE-NEXT: movaps %xmm1, 80(%rsi)
13836 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13837 ; SSE-NEXT: movaps %xmm1, 64(%rsi)
13838 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13839 ; SSE-NEXT: movaps %xmm1, 48(%rsi)
13840 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13841 ; SSE-NEXT: movaps %xmm1, 32(%rsi)
13842 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13843 ; SSE-NEXT: movaps %xmm1, 16(%rsi)
13844 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13845 ; SSE-NEXT: movaps %xmm1, (%rsi)
13846 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13847 ; SSE-NEXT: movaps %xmm1, 496(%rdx)
13848 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13849 ; SSE-NEXT: movaps %xmm1, 480(%rdx)
13850 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13851 ; SSE-NEXT: movaps %xmm1, 464(%rdx)
13852 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13853 ; SSE-NEXT: movaps %xmm1, 448(%rdx)
13854 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13855 ; SSE-NEXT: movaps %xmm1, 432(%rdx)
13856 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13857 ; SSE-NEXT: movaps %xmm1, 416(%rdx)
13858 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13859 ; SSE-NEXT: movaps %xmm1, 400(%rdx)
13860 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13861 ; SSE-NEXT: movaps %xmm1, 384(%rdx)
13862 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13863 ; SSE-NEXT: movaps %xmm1, 368(%rdx)
13864 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13865 ; SSE-NEXT: movaps %xmm1, 352(%rdx)
13866 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13867 ; SSE-NEXT: movaps %xmm1, 336(%rdx)
13868 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13869 ; SSE-NEXT: movaps %xmm1, 320(%rdx)
13870 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13871 ; SSE-NEXT: movaps %xmm1, 304(%rdx)
13872 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13873 ; SSE-NEXT: movaps %xmm1, 288(%rdx)
13874 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13875 ; SSE-NEXT: movaps %xmm1, 272(%rdx)
13876 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13877 ; SSE-NEXT: movaps %xmm1, 256(%rdx)
13878 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13879 ; SSE-NEXT: movaps %xmm1, 240(%rdx)
13880 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13881 ; SSE-NEXT: movaps %xmm1, 224(%rdx)
13882 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13883 ; SSE-NEXT: movaps %xmm1, 208(%rdx)
13884 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13885 ; SSE-NEXT: movaps %xmm1, 192(%rdx)
13886 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13887 ; SSE-NEXT: movaps %xmm1, 176(%rdx)
13888 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13889 ; SSE-NEXT: movaps %xmm1, 160(%rdx)
13890 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13891 ; SSE-NEXT: movaps %xmm1, 144(%rdx)
13892 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13893 ; SSE-NEXT: movaps %xmm1, 128(%rdx)
13894 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13895 ; SSE-NEXT: movaps %xmm1, 112(%rdx)
13896 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13897 ; SSE-NEXT: movaps %xmm1, 96(%rdx)
13898 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13899 ; SSE-NEXT: movaps %xmm1, 80(%rdx)
13900 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13901 ; SSE-NEXT: movaps %xmm1, 64(%rdx)
13902 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13903 ; SSE-NEXT: movaps %xmm1, 48(%rdx)
13904 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13905 ; SSE-NEXT: movaps %xmm1, 32(%rdx)
13906 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13907 ; SSE-NEXT: movaps %xmm1, 16(%rdx)
13908 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13909 ; SSE-NEXT: movaps %xmm1, (%rdx)
13910 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13911 ; SSE-NEXT: movaps %xmm1, 496(%rcx)
13912 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13913 ; SSE-NEXT: movaps %xmm1, 480(%rcx)
13914 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13915 ; SSE-NEXT: movaps %xmm1, 464(%rcx)
13916 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13917 ; SSE-NEXT: movaps %xmm1, 448(%rcx)
13918 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13919 ; SSE-NEXT: movaps %xmm1, 432(%rcx)
13920 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13921 ; SSE-NEXT: movaps %xmm1, 416(%rcx)
13922 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13923 ; SSE-NEXT: movaps %xmm1, 400(%rcx)
13924 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13925 ; SSE-NEXT: movaps %xmm1, 384(%rcx)
13926 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13927 ; SSE-NEXT: movaps %xmm1, 368(%rcx)
13928 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13929 ; SSE-NEXT: movaps %xmm1, 352(%rcx)
13930 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13931 ; SSE-NEXT: movaps %xmm1, 336(%rcx)
13932 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13933 ; SSE-NEXT: movaps %xmm1, 320(%rcx)
13934 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13935 ; SSE-NEXT: movaps %xmm1, 304(%rcx)
13936 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13937 ; SSE-NEXT: movaps %xmm1, 288(%rcx)
13938 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13939 ; SSE-NEXT: movaps %xmm1, 272(%rcx)
13940 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13941 ; SSE-NEXT: movaps %xmm1, 256(%rcx)
13942 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13943 ; SSE-NEXT: movaps %xmm1, 240(%rcx)
13944 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13945 ; SSE-NEXT: movaps %xmm1, 224(%rcx)
13946 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13947 ; SSE-NEXT: movaps %xmm1, 208(%rcx)
13948 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13949 ; SSE-NEXT: movaps %xmm1, 192(%rcx)
13950 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13951 ; SSE-NEXT: movaps %xmm1, 176(%rcx)
13952 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13953 ; SSE-NEXT: movaps %xmm1, 160(%rcx)
13954 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13955 ; SSE-NEXT: movaps %xmm1, 144(%rcx)
13956 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13957 ; SSE-NEXT: movaps %xmm1, 128(%rcx)
13958 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13959 ; SSE-NEXT: movaps %xmm1, 112(%rcx)
13960 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13961 ; SSE-NEXT: movaps %xmm1, 96(%rcx)
13962 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13963 ; SSE-NEXT: movaps %xmm1, 80(%rcx)
13964 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13965 ; SSE-NEXT: movaps %xmm1, 64(%rcx)
13966 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13967 ; SSE-NEXT: movaps %xmm1, 48(%rcx)
13968 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13969 ; SSE-NEXT: movaps %xmm1, 32(%rcx)
13970 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13971 ; SSE-NEXT: movaps %xmm1, 16(%rcx)
13972 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13973 ; SSE-NEXT: movaps %xmm1, (%rcx)
13974 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13975 ; SSE-NEXT: movaps %xmm1, 496(%r8)
13976 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13977 ; SSE-NEXT: movaps %xmm1, 480(%r8)
13978 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13979 ; SSE-NEXT: movaps %xmm1, 464(%r8)
13980 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13981 ; SSE-NEXT: movaps %xmm1, 448(%r8)
13982 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13983 ; SSE-NEXT: movaps %xmm1, 432(%r8)
13984 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13985 ; SSE-NEXT: movaps %xmm1, 416(%r8)
13986 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13987 ; SSE-NEXT: movaps %xmm1, 400(%r8)
13988 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13989 ; SSE-NEXT: movaps %xmm1, 384(%r8)
13990 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13991 ; SSE-NEXT: movaps %xmm1, 368(%r8)
13992 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13993 ; SSE-NEXT: movaps %xmm1, 352(%r8)
13994 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13995 ; SSE-NEXT: movaps %xmm1, 336(%r8)
13996 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13997 ; SSE-NEXT: movaps %xmm1, 320(%r8)
13998 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13999 ; SSE-NEXT: movaps %xmm1, 304(%r8)
14000 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14001 ; SSE-NEXT: movaps %xmm1, 288(%r8)
14002 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14003 ; SSE-NEXT: movaps %xmm1, 272(%r8)
14004 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14005 ; SSE-NEXT: movaps %xmm1, 256(%r8)
14006 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14007 ; SSE-NEXT: movaps %xmm1, 240(%r8)
14008 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14009 ; SSE-NEXT: movaps %xmm1, 224(%r8)
14010 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14011 ; SSE-NEXT: movaps %xmm1, 208(%r8)
14012 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14013 ; SSE-NEXT: movaps %xmm1, 192(%r8)
14014 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14015 ; SSE-NEXT: movaps %xmm1, 176(%r8)
14016 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14017 ; SSE-NEXT: movaps %xmm1, 160(%r8)
14018 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14019 ; SSE-NEXT: movaps %xmm1, 144(%r8)
14020 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14021 ; SSE-NEXT: movaps %xmm1, 128(%r8)
14022 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14023 ; SSE-NEXT: movaps %xmm1, 112(%r8)
14024 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14025 ; SSE-NEXT: movaps %xmm1, 96(%r8)
14026 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14027 ; SSE-NEXT: movaps %xmm1, 80(%r8)
14028 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14029 ; SSE-NEXT: movaps %xmm1, 64(%r8)
14030 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14031 ; SSE-NEXT: movaps %xmm1, 48(%r8)
14032 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14033 ; SSE-NEXT: movaps %xmm1, 32(%r8)
14034 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14035 ; SSE-NEXT: movaps %xmm1, 16(%r8)
14036 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14037 ; SSE-NEXT: movaps %xmm1, (%r8)
14038 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14039 ; SSE-NEXT: movaps %xmm1, 496(%r9)
14040 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14041 ; SSE-NEXT: movaps %xmm1, 480(%r9)
14042 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14043 ; SSE-NEXT: movaps %xmm1, 464(%r9)
14044 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14045 ; SSE-NEXT: movaps %xmm1, 448(%r9)
14046 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14047 ; SSE-NEXT: movaps %xmm1, 432(%r9)
14048 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14049 ; SSE-NEXT: movaps %xmm1, 416(%r9)
14050 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14051 ; SSE-NEXT: movaps %xmm1, 400(%r9)
14052 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14053 ; SSE-NEXT: movaps %xmm1, 384(%r9)
14054 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14055 ; SSE-NEXT: movaps %xmm1, 368(%r9)
14056 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14057 ; SSE-NEXT: movaps %xmm1, 352(%r9)
14058 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14059 ; SSE-NEXT: movaps %xmm1, 336(%r9)
14060 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14061 ; SSE-NEXT: movaps %xmm1, 320(%r9)
14062 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14063 ; SSE-NEXT: movaps %xmm1, 304(%r9)
14064 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14065 ; SSE-NEXT: movaps %xmm1, 288(%r9)
14066 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14067 ; SSE-NEXT: movaps %xmm1, 272(%r9)
14068 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14069 ; SSE-NEXT: movaps %xmm1, 256(%r9)
14070 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14071 ; SSE-NEXT: movaps %xmm1, 240(%r9)
14072 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14073 ; SSE-NEXT: movaps %xmm1, 224(%r9)
14074 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14075 ; SSE-NEXT: movaps %xmm1, 208(%r9)
14076 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14077 ; SSE-NEXT: movaps %xmm1, 192(%r9)
14078 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14079 ; SSE-NEXT: movaps %xmm1, 176(%r9)
14080 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14081 ; SSE-NEXT: movaps %xmm1, 160(%r9)
14082 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14083 ; SSE-NEXT: movaps %xmm1, 144(%r9)
14084 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14085 ; SSE-NEXT: movaps %xmm1, 128(%r9)
14086 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14087 ; SSE-NEXT: movaps %xmm1, 112(%r9)
14088 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14089 ; SSE-NEXT: movaps %xmm1, 96(%r9)
14090 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14091 ; SSE-NEXT: movaps %xmm1, 80(%r9)
14092 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14093 ; SSE-NEXT: movaps %xmm1, 64(%r9)
14094 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14095 ; SSE-NEXT: movaps %xmm1, 48(%r9)
14096 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14097 ; SSE-NEXT: movaps %xmm1, 32(%r9)
14098 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14099 ; SSE-NEXT: movaps %xmm1, 16(%r9)
14100 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14101 ; SSE-NEXT: movaps %xmm1, (%r9)
14102 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
14103 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14104 ; SSE-NEXT: movaps %xmm1, 496(%rax)
14105 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14106 ; SSE-NEXT: movaps %xmm1, 480(%rax)
14107 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14108 ; SSE-NEXT: movaps %xmm1, 464(%rax)
14109 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14110 ; SSE-NEXT: movaps %xmm1, 448(%rax)
14111 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14112 ; SSE-NEXT: movaps %xmm1, 432(%rax)
14113 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14114 ; SSE-NEXT: movaps %xmm1, 416(%rax)
14115 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14116 ; SSE-NEXT: movaps %xmm1, 400(%rax)
14117 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14118 ; SSE-NEXT: movaps %xmm1, 384(%rax)
14119 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14120 ; SSE-NEXT: movaps %xmm1, 368(%rax)
14121 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14122 ; SSE-NEXT: movaps %xmm1, 352(%rax)
14123 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14124 ; SSE-NEXT: movaps %xmm1, 336(%rax)
14125 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14126 ; SSE-NEXT: movaps %xmm1, 320(%rax)
14127 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14128 ; SSE-NEXT: movaps %xmm1, 304(%rax)
14129 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14130 ; SSE-NEXT: movaps %xmm1, 288(%rax)
14131 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14132 ; SSE-NEXT: movaps %xmm1, 272(%rax)
14133 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14134 ; SSE-NEXT: movaps %xmm1, 256(%rax)
14135 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14136 ; SSE-NEXT: movaps %xmm1, 240(%rax)
14137 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14138 ; SSE-NEXT: movaps %xmm1, 224(%rax)
14139 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14140 ; SSE-NEXT: movaps %xmm1, 208(%rax)
14141 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14142 ; SSE-NEXT: movaps %xmm1, 192(%rax)
14143 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14144 ; SSE-NEXT: movaps %xmm1, 176(%rax)
14145 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14146 ; SSE-NEXT: movaps %xmm1, 160(%rax)
14147 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14148 ; SSE-NEXT: movaps %xmm1, 144(%rax)
14149 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14150 ; SSE-NEXT: movaps %xmm1, 128(%rax)
14151 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14152 ; SSE-NEXT: movaps %xmm1, 112(%rax)
14153 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14154 ; SSE-NEXT: movaps %xmm1, 96(%rax)
14155 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14156 ; SSE-NEXT: movaps %xmm1, 80(%rax)
14157 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14158 ; SSE-NEXT: movaps %xmm1, 64(%rax)
14159 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14160 ; SSE-NEXT: movaps %xmm1, 48(%rax)
14161 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14162 ; SSE-NEXT: movaps %xmm1, 32(%rax)
14163 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14164 ; SSE-NEXT: movaps %xmm1, 16(%rax)
14165 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14166 ; SSE-NEXT: movaps %xmm1, (%rax)
14167 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
14168 ; SSE-NEXT: movaps %xmm3, 496(%rax)
14169 ; SSE-NEXT: movaps %xmm6, 480(%rax)
14170 ; SSE-NEXT: movaps %xmm7, 464(%rax)
14171 ; SSE-NEXT: movaps %xmm10, 448(%rax)
14172 ; SSE-NEXT: movaps %xmm13, 432(%rax)
14173 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14174 ; SSE-NEXT: movaps %xmm1, 416(%rax)
14175 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14176 ; SSE-NEXT: movaps %xmm1, 400(%rax)
14177 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14178 ; SSE-NEXT: movaps %xmm1, 384(%rax)
14179 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14180 ; SSE-NEXT: movaps %xmm1, 368(%rax)
14181 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14182 ; SSE-NEXT: movaps %xmm1, 352(%rax)
14183 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14184 ; SSE-NEXT: movaps %xmm1, 336(%rax)
14185 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14186 ; SSE-NEXT: movaps %xmm1, 320(%rax)
14187 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14188 ; SSE-NEXT: movaps %xmm1, 304(%rax)
14189 ; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload
14190 ; SSE-NEXT: movaps %xmm1, 288(%rax)
14191 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14192 ; SSE-NEXT: movaps %xmm1, 272(%rax)
14193 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14194 ; SSE-NEXT: movaps %xmm1, 256(%rax)
14195 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14196 ; SSE-NEXT: movaps %xmm1, 240(%rax)
14197 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14198 ; SSE-NEXT: movaps %xmm1, 224(%rax)
14199 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14200 ; SSE-NEXT: movaps %xmm1, 208(%rax)
14201 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14202 ; SSE-NEXT: movaps %xmm1, 192(%rax)
14203 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14204 ; SSE-NEXT: movaps %xmm1, 176(%rax)
14205 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14206 ; SSE-NEXT: movaps %xmm1, 160(%rax)
14207 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14208 ; SSE-NEXT: movaps %xmm1, 144(%rax)
14209 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14210 ; SSE-NEXT: movaps %xmm1, 128(%rax)
14211 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14212 ; SSE-NEXT: movaps %xmm1, 112(%rax)
14213 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14214 ; SSE-NEXT: movaps %xmm1, 96(%rax)
14215 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14216 ; SSE-NEXT: movaps %xmm1, 80(%rax)
14217 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14218 ; SSE-NEXT: movaps %xmm1, 64(%rax)
14219 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14220 ; SSE-NEXT: movaps %xmm1, 48(%rax)
14221 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14222 ; SSE-NEXT: movaps %xmm1, 32(%rax)
14223 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14224 ; SSE-NEXT: movaps %xmm1, 16(%rax)
14225 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14226 ; SSE-NEXT: movaps %xmm1, (%rax)
14227 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
14228 ; SSE-NEXT: movaps %xmm4, 496(%rax)
14229 ; SSE-NEXT: movaps %xmm5, 480(%rax)
14230 ; SSE-NEXT: movaps %xmm0, 464(%rax)
14231 ; SSE-NEXT: movaps %xmm2, 448(%rax)
14232 ; SSE-NEXT: movaps %xmm8, 432(%rax)
14233 ; SSE-NEXT: movaps %xmm12, 416(%rax)
14234 ; SSE-NEXT: movaps %xmm9, 400(%rax)
14235 ; SSE-NEXT: movaps %xmm15, 384(%rax)
14236 ; SSE-NEXT: movaps %xmm11, 368(%rax)
14237 ; SSE-NEXT: movaps %xmm14, 352(%rax)
14238 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
14239 ; SSE-NEXT: movaps %xmm0, 336(%rax)
14240 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
14241 ; SSE-NEXT: movaps %xmm0, 320(%rax)
14242 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
14243 ; SSE-NEXT: movaps %xmm0, 304(%rax)
14244 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
14245 ; SSE-NEXT: movaps %xmm0, 288(%rax)
14246 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
14247 ; SSE-NEXT: movaps %xmm0, 272(%rax)
14248 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
14249 ; SSE-NEXT: movaps %xmm0, 256(%rax)
14250 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
14251 ; SSE-NEXT: movaps %xmm0, 240(%rax)
14252 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
14253 ; SSE-NEXT: movaps %xmm0, 224(%rax)
14254 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
14255 ; SSE-NEXT: movaps %xmm0, 208(%rax)
14256 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
14257 ; SSE-NEXT: movaps %xmm0, 192(%rax)
14258 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
14259 ; SSE-NEXT: movaps %xmm0, 176(%rax)
14260 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
14261 ; SSE-NEXT: movaps %xmm0, 160(%rax)
14262 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
14263 ; SSE-NEXT: movaps %xmm0, 144(%rax)
14264 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
14265 ; SSE-NEXT: movaps %xmm0, 128(%rax)
14266 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
14267 ; SSE-NEXT: movaps %xmm0, 112(%rax)
14268 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
14269 ; SSE-NEXT: movaps %xmm0, 96(%rax)
14270 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
14271 ; SSE-NEXT: movaps %xmm0, 80(%rax)
14272 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
14273 ; SSE-NEXT: movaps %xmm0, 64(%rax)
14274 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
14275 ; SSE-NEXT: movaps %xmm0, 48(%rax)
14276 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
14277 ; SSE-NEXT: movaps %xmm0, 32(%rax)
14278 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
14279 ; SSE-NEXT: movaps %xmm0, 16(%rax)
14280 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
14281 ; SSE-NEXT: movaps %xmm0, (%rax)
14282 ; SSE-NEXT: addq $3720, %rsp # imm = 0xE88
14285 ; AVX-LABEL: load_i64_stride8_vf64:
14287 ; AVX-NEXT: subq $5016, %rsp # imm = 0x1398
14288 ; AVX-NEXT: vmovaps 2496(%rdi), %xmm0
14289 ; AVX-NEXT: vmovaps 2432(%rdi), %xmm1
14290 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
14291 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14292 ; AVX-NEXT: vmovaps 3008(%rdi), %xmm2
14293 ; AVX-NEXT: vmovaps 2944(%rdi), %xmm3
14294 ; AVX-NEXT: vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm2[0]
14295 ; AVX-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14296 ; AVX-NEXT: vmovaps 3520(%rdi), %xmm4
14297 ; AVX-NEXT: vmovaps 3456(%rdi), %xmm5
14298 ; AVX-NEXT: vmovlhps {{.*#+}} xmm6 = xmm5[0],xmm4[0]
14299 ; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14300 ; AVX-NEXT: vmovaps 2880(%rdi), %xmm6
14301 ; AVX-NEXT: vmovaps 3392(%rdi), %xmm7
14302 ; AVX-NEXT: vmovaps 3904(%rdi), %xmm8
14303 ; AVX-NEXT: vmovaps 3840(%rdi), %xmm9
14304 ; AVX-NEXT: vmovaps 4032(%rdi), %xmm10
14305 ; AVX-NEXT: vmovaps 3968(%rdi), %xmm11
14306 ; AVX-NEXT: vmovlhps {{.*#+}} xmm12 = xmm11[0],xmm10[0]
14307 ; AVX-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14308 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm11[1],xmm10[1]
14309 ; AVX-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14310 ; AVX-NEXT: vmovlhps {{.*#+}} xmm10 = xmm9[0],xmm8[0]
14311 ; AVX-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14312 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm9[1],xmm8[1]
14313 ; AVX-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14314 ; AVX-NEXT: vmovaps 3328(%rdi), %xmm8
14315 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm5[1],xmm4[1]
14316 ; AVX-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14317 ; AVX-NEXT: vmovlhps {{.*#+}} xmm4 = xmm8[0],xmm7[0]
14318 ; AVX-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14319 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm8[1],xmm7[1]
14320 ; AVX-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14321 ; AVX-NEXT: vmovaps 2816(%rdi), %xmm4
14322 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1]
14323 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14324 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm4[0],xmm6[0]
14325 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14326 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm4[1],xmm6[1]
14327 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14328 ; AVX-NEXT: vmovaps 2368(%rdi), %xmm2
14329 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
14330 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14331 ; AVX-NEXT: vmovaps 2304(%rdi), %xmm0
14332 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm2[0]
14333 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14334 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
14335 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14336 ; AVX-NEXT: vmovaps 1984(%rdi), %xmm0
14337 ; AVX-NEXT: vmovaps 1920(%rdi), %xmm1
14338 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
14339 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14340 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
14341 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14342 ; AVX-NEXT: vmovaps 1856(%rdi), %xmm0
14343 ; AVX-NEXT: vmovaps 1792(%rdi), %xmm1
14344 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
14345 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14346 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
14347 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14348 ; AVX-NEXT: vmovaps 1472(%rdi), %xmm0
14349 ; AVX-NEXT: vmovaps 1408(%rdi), %xmm1
14350 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
14351 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14352 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
14353 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14354 ; AVX-NEXT: vmovaps 1344(%rdi), %xmm0
14355 ; AVX-NEXT: vmovaps 1280(%rdi), %xmm1
14356 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
14357 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14358 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
14359 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14360 ; AVX-NEXT: vmovaps 448(%rdi), %xmm0
14361 ; AVX-NEXT: vmovaps 384(%rdi), %xmm1
14362 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
14363 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14364 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
14365 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14366 ; AVX-NEXT: vmovaps 320(%rdi), %xmm0
14367 ; AVX-NEXT: vmovaps 256(%rdi), %xmm1
14368 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
14369 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14370 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
14371 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14372 ; AVX-NEXT: vmovaps 960(%rdi), %xmm0
14373 ; AVX-NEXT: vmovaps 896(%rdi), %xmm1
14374 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
14375 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14376 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
14377 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14378 ; AVX-NEXT: vmovaps 832(%rdi), %xmm0
14379 ; AVX-NEXT: vmovaps 768(%rdi), %xmm1
14380 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
14381 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14382 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
14383 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14384 ; AVX-NEXT: vmovaps 3776(%rdi), %xmm0
14385 ; AVX-NEXT: vmovaps 3712(%rdi), %xmm1
14386 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
14387 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14388 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
14389 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14390 ; AVX-NEXT: vmovaps 3648(%rdi), %xmm0
14391 ; AVX-NEXT: vmovaps 3584(%rdi), %xmm1
14392 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
14393 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14394 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
14395 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14396 ; AVX-NEXT: vmovaps 3264(%rdi), %xmm0
14397 ; AVX-NEXT: vmovaps 3200(%rdi), %xmm1
14398 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
14399 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14400 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
14401 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14402 ; AVX-NEXT: vmovaps 3136(%rdi), %xmm0
14403 ; AVX-NEXT: vmovaps 3072(%rdi), %xmm1
14404 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
14405 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14406 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
14407 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14408 ; AVX-NEXT: vmovaps 2752(%rdi), %xmm0
14409 ; AVX-NEXT: vmovaps 2688(%rdi), %xmm1
14410 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
14411 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14412 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
14413 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14414 ; AVX-NEXT: vmovaps 2624(%rdi), %xmm0
14415 ; AVX-NEXT: vmovaps 2560(%rdi), %xmm1
14416 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
14417 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14418 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
14419 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14420 ; AVX-NEXT: vmovaps 1728(%rdi), %xmm0
14421 ; AVX-NEXT: vmovaps 1664(%rdi), %xmm1
14422 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
14423 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14424 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
14425 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14426 ; AVX-NEXT: vmovaps 1600(%rdi), %xmm0
14427 ; AVX-NEXT: vmovaps 1536(%rdi), %xmm1
14428 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
14429 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14430 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
14431 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14432 ; AVX-NEXT: vmovaps 192(%rdi), %xmm0
14433 ; AVX-NEXT: vmovaps 128(%rdi), %xmm1
14434 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
14435 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14436 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
14437 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14438 ; AVX-NEXT: vmovaps 64(%rdi), %xmm0
14439 ; AVX-NEXT: vmovaps (%rdi), %xmm1
14440 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
14441 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14442 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
14443 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14444 ; AVX-NEXT: vmovaps 704(%rdi), %xmm0
14445 ; AVX-NEXT: vmovaps 640(%rdi), %xmm1
14446 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
14447 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14448 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
14449 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14450 ; AVX-NEXT: vmovaps 576(%rdi), %xmm0
14451 ; AVX-NEXT: vmovaps 512(%rdi), %xmm1
14452 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
14453 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14454 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
14455 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14456 ; AVX-NEXT: vmovaps 2240(%rdi), %xmm0
14457 ; AVX-NEXT: vmovaps 2176(%rdi), %xmm1
14458 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
14459 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14460 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
14461 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14462 ; AVX-NEXT: vmovaps 2112(%rdi), %xmm0
14463 ; AVX-NEXT: vmovaps 2048(%rdi), %xmm1
14464 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
14465 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14466 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
14467 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14468 ; AVX-NEXT: vmovaps 1216(%rdi), %xmm0
14469 ; AVX-NEXT: vmovaps 1152(%rdi), %xmm1
14470 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
14471 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14472 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
14473 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14474 ; AVX-NEXT: vmovaps 1088(%rdi), %xmm0
14475 ; AVX-NEXT: vmovaps 1024(%rdi), %xmm1
14476 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
14477 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14478 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
14479 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14480 ; AVX-NEXT: vmovaps 448(%rdi), %ymm1
14481 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14482 ; AVX-NEXT: vmovaps 384(%rdi), %ymm0
14483 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14484 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
14485 ; AVX-NEXT: vmovaps 336(%rdi), %xmm2
14486 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14487 ; AVX-NEXT: vmovaps 272(%rdi), %xmm1
14488 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14489 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
14490 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14491 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14492 ; AVX-NEXT: vmovaps 960(%rdi), %ymm1
14493 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14494 ; AVX-NEXT: vmovaps 896(%rdi), %ymm0
14495 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14496 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
14497 ; AVX-NEXT: vmovaps 848(%rdi), %xmm2
14498 ; AVX-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill
14499 ; AVX-NEXT: vmovaps 784(%rdi), %xmm1
14500 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14501 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
14502 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14503 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14504 ; AVX-NEXT: vmovaps 1472(%rdi), %ymm1
14505 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14506 ; AVX-NEXT: vmovaps 1408(%rdi), %ymm0
14507 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14508 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
14509 ; AVX-NEXT: vmovaps 1360(%rdi), %xmm2
14510 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14511 ; AVX-NEXT: vmovaps 1296(%rdi), %xmm1
14512 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14513 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
14514 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14515 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14516 ; AVX-NEXT: vmovaps 1984(%rdi), %ymm1
14517 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14518 ; AVX-NEXT: vmovaps 1920(%rdi), %ymm0
14519 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14520 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
14521 ; AVX-NEXT: vmovaps 1872(%rdi), %xmm2
14522 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14523 ; AVX-NEXT: vmovaps 1808(%rdi), %xmm1
14524 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14525 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
14526 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14527 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14528 ; AVX-NEXT: vmovaps 2496(%rdi), %ymm1
14529 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14530 ; AVX-NEXT: vmovaps 2432(%rdi), %ymm0
14531 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14532 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
14533 ; AVX-NEXT: vmovaps 2384(%rdi), %xmm2
14534 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14535 ; AVX-NEXT: vmovaps 2320(%rdi), %xmm1
14536 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14537 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
14538 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14539 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14540 ; AVX-NEXT: vmovaps 3008(%rdi), %ymm1
14541 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14542 ; AVX-NEXT: vmovaps 2944(%rdi), %ymm0
14543 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14544 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
14545 ; AVX-NEXT: vmovaps 2896(%rdi), %xmm2
14546 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14547 ; AVX-NEXT: vmovaps 2832(%rdi), %xmm1
14548 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14549 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
14550 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14551 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14552 ; AVX-NEXT: vmovaps 3520(%rdi), %ymm1
14553 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14554 ; AVX-NEXT: vmovaps 3456(%rdi), %ymm0
14555 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14556 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
14557 ; AVX-NEXT: vmovaps 3408(%rdi), %xmm2
14558 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14559 ; AVX-NEXT: vmovaps 3344(%rdi), %xmm1
14560 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14561 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
14562 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14563 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14564 ; AVX-NEXT: vmovaps 4032(%rdi), %ymm1
14565 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14566 ; AVX-NEXT: vmovaps 3968(%rdi), %ymm0
14567 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14568 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
14569 ; AVX-NEXT: vmovaps 3920(%rdi), %xmm2
14570 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14571 ; AVX-NEXT: vmovaps 3856(%rdi), %xmm1
14572 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14573 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
14574 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14575 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14576 ; AVX-NEXT: vmovaps 192(%rdi), %ymm0
14577 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14578 ; AVX-NEXT: vmovaps 128(%rdi), %ymm1
14579 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14580 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
14581 ; AVX-NEXT: vmovaps 80(%rdi), %xmm0
14582 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14583 ; AVX-NEXT: vmovaps 16(%rdi), %xmm8
14584 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm8[0],xmm0[0]
14585 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm1[4,5,6,7]
14586 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14587 ; AVX-NEXT: vmovaps 704(%rdi), %ymm0
14588 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14589 ; AVX-NEXT: vmovaps 640(%rdi), %ymm1
14590 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14591 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
14592 ; AVX-NEXT: vmovaps 592(%rdi), %xmm0
14593 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14594 ; AVX-NEXT: vmovaps 528(%rdi), %xmm1
14595 ; AVX-NEXT: vmovlhps {{.*#+}} xmm3 = xmm1[0],xmm0[0]
14596 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm2[4,5,6,7]
14597 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14598 ; AVX-NEXT: vmovaps 1216(%rdi), %ymm0
14599 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14600 ; AVX-NEXT: vmovaps 1152(%rdi), %ymm2
14601 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14602 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm0[0],ymm2[2],ymm0[2]
14603 ; AVX-NEXT: vmovaps 1104(%rdi), %xmm10
14604 ; AVX-NEXT: vmovaps 1040(%rdi), %xmm2
14605 ; AVX-NEXT: vmovlhps {{.*#+}} xmm4 = xmm2[0],xmm10[0]
14606 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm3[4,5,6,7]
14607 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14608 ; AVX-NEXT: vmovaps 1728(%rdi), %ymm0
14609 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14610 ; AVX-NEXT: vmovaps 1664(%rdi), %ymm3
14611 ; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14612 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm0[0],ymm3[2],ymm0[2]
14613 ; AVX-NEXT: vmovaps 1616(%rdi), %xmm11
14614 ; AVX-NEXT: vmovaps 1552(%rdi), %xmm3
14615 ; AVX-NEXT: vmovlhps {{.*#+}} xmm5 = xmm3[0],xmm11[0]
14616 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm4[4,5,6,7]
14617 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14618 ; AVX-NEXT: vmovaps 2240(%rdi), %ymm0
14619 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14620 ; AVX-NEXT: vmovaps 2176(%rdi), %ymm4
14621 ; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14622 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm4[0],ymm0[0],ymm4[2],ymm0[2]
14623 ; AVX-NEXT: vmovaps 2128(%rdi), %xmm12
14624 ; AVX-NEXT: vmovaps 2064(%rdi), %xmm4
14625 ; AVX-NEXT: vmovlhps {{.*#+}} xmm6 = xmm4[0],xmm12[0]
14626 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm5[4,5,6,7]
14627 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14628 ; AVX-NEXT: vmovaps 2752(%rdi), %ymm0
14629 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14630 ; AVX-NEXT: vmovaps 2688(%rdi), %ymm5
14631 ; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14632 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm5[0],ymm0[0],ymm5[2],ymm0[2]
14633 ; AVX-NEXT: vmovaps 2640(%rdi), %xmm13
14634 ; AVX-NEXT: vmovaps 2576(%rdi), %xmm5
14635 ; AVX-NEXT: vmovlhps {{.*#+}} xmm7 = xmm5[0],xmm13[0]
14636 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm6[4,5,6,7]
14637 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14638 ; AVX-NEXT: vmovaps 3264(%rdi), %ymm0
14639 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14640 ; AVX-NEXT: vmovaps 3200(%rdi), %ymm6
14641 ; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14642 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm6[0],ymm0[0],ymm6[2],ymm0[2]
14643 ; AVX-NEXT: vmovaps 3152(%rdi), %xmm14
14644 ; AVX-NEXT: vmovaps 3088(%rdi), %xmm6
14645 ; AVX-NEXT: vmovlhps {{.*#+}} xmm15 = xmm6[0],xmm14[0]
14646 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm7[4,5,6,7]
14647 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14648 ; AVX-NEXT: vmovaps 3776(%rdi), %ymm7
14649 ; AVX-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14650 ; AVX-NEXT: vmovaps 3712(%rdi), %ymm0
14651 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14652 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2]
14653 ; AVX-NEXT: vmovaps 3664(%rdi), %xmm15
14654 ; AVX-NEXT: vmovaps 3600(%rdi), %xmm7
14655 ; AVX-NEXT: vmovlhps {{.*#+}} xmm9 = xmm7[0],xmm15[0]
14656 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
14657 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14658 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
14659 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
14660 ; AVX-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
14661 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
14662 ; AVX-NEXT: # xmm8 = xmm8[1],mem[1]
14663 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7]
14664 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14665 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
14666 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
14667 ; AVX-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
14668 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
14669 ; AVX-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
14670 ; AVX-NEXT: # xmm8 = mem[0,1],xmm8[2,3]
14671 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7]
14672 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14673 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
14674 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
14675 ; AVX-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
14676 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
14677 ; AVX-NEXT: # xmm1 = xmm1[1],mem[1]
14678 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14679 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14680 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
14681 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
14682 ; AVX-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
14683 ; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
14684 ; AVX-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
14685 ; AVX-NEXT: # xmm1 = mem[0,1],xmm1[2,3]
14686 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14687 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14688 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
14689 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
14690 ; AVX-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
14691 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm10[1]
14692 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14693 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14694 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
14695 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
14696 ; AVX-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
14697 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14698 ; AVX-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
14699 ; AVX-NEXT: # xmm1 = mem[0,1],xmm1[2,3]
14700 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14701 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14702 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
14703 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
14704 ; AVX-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
14705 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm3[1],xmm11[1]
14706 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14707 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14708 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
14709 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
14710 ; AVX-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
14711 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14712 ; AVX-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
14713 ; AVX-NEXT: # xmm1 = mem[0,1],xmm1[2,3]
14714 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14715 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14716 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
14717 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
14718 ; AVX-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
14719 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm12[1]
14720 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14721 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14722 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
14723 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
14724 ; AVX-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
14725 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14726 ; AVX-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
14727 ; AVX-NEXT: # xmm1 = mem[0,1],xmm1[2,3]
14728 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14729 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14730 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
14731 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
14732 ; AVX-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
14733 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm5[1],xmm13[1]
14734 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14735 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14736 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
14737 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
14738 ; AVX-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
14739 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14740 ; AVX-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
14741 ; AVX-NEXT: # xmm1 = mem[0,1],xmm1[2,3]
14742 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14743 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14744 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
14745 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
14746 ; AVX-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
14747 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm6[1],xmm14[1]
14748 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14749 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14750 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
14751 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
14752 ; AVX-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
14753 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14754 ; AVX-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
14755 ; AVX-NEXT: # xmm1 = mem[0,1],xmm1[2,3]
14756 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14757 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14758 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
14759 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
14760 ; AVX-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
14761 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm7[1],xmm15[1]
14762 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14763 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14764 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
14765 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
14766 ; AVX-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
14767 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14768 ; AVX-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
14769 ; AVX-NEXT: # xmm1 = mem[0,1],xmm1[2,3]
14770 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14771 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14772 ; AVX-NEXT: vmovaps 96(%rdi), %xmm0
14773 ; AVX-NEXT: vmovaps 32(%rdi), %xmm1
14774 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
14775 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14776 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
14777 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14778 ; AVX-NEXT: vmovaps 224(%rdi), %xmm0
14779 ; AVX-NEXT: vmovaps 160(%rdi), %xmm1
14780 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
14781 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14782 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
14783 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14784 ; AVX-NEXT: vmovaps 352(%rdi), %xmm0
14785 ; AVX-NEXT: vmovaps 288(%rdi), %xmm1
14786 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
14787 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14788 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
14789 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14790 ; AVX-NEXT: vmovaps 480(%rdi), %xmm0
14791 ; AVX-NEXT: vmovaps 416(%rdi), %xmm1
14792 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
14793 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14794 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
14795 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14796 ; AVX-NEXT: vmovaps 608(%rdi), %xmm0
14797 ; AVX-NEXT: vmovaps 544(%rdi), %xmm1
14798 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
14799 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14800 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
14801 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14802 ; AVX-NEXT: vmovaps 736(%rdi), %xmm0
14803 ; AVX-NEXT: vmovaps 672(%rdi), %xmm1
14804 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
14805 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14806 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
14807 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14808 ; AVX-NEXT: vmovaps 864(%rdi), %xmm0
14809 ; AVX-NEXT: vmovaps 800(%rdi), %xmm1
14810 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
14811 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14812 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
14813 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14814 ; AVX-NEXT: vmovaps 992(%rdi), %xmm0
14815 ; AVX-NEXT: vmovaps 928(%rdi), %xmm1
14816 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
14817 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14818 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
14819 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14820 ; AVX-NEXT: vmovaps 1120(%rdi), %xmm0
14821 ; AVX-NEXT: vmovaps 1056(%rdi), %xmm1
14822 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
14823 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14824 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
14825 ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
14826 ; AVX-NEXT: vmovaps 1248(%rdi), %xmm0
14827 ; AVX-NEXT: vmovaps 1184(%rdi), %xmm1
14828 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
14829 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14830 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
14831 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14832 ; AVX-NEXT: vmovaps 1376(%rdi), %xmm0
14833 ; AVX-NEXT: vmovaps 1312(%rdi), %xmm1
14834 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
14835 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14836 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
14837 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14838 ; AVX-NEXT: vmovaps 1504(%rdi), %xmm0
14839 ; AVX-NEXT: vmovaps 1440(%rdi), %xmm1
14840 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
14841 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14842 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
14843 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14844 ; AVX-NEXT: vmovaps 1632(%rdi), %xmm0
14845 ; AVX-NEXT: vmovaps 1568(%rdi), %xmm1
14846 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
14847 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14848 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
14849 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14850 ; AVX-NEXT: vmovaps 1760(%rdi), %xmm0
14851 ; AVX-NEXT: vmovaps 1696(%rdi), %xmm1
14852 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
14853 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14854 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
14855 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14856 ; AVX-NEXT: vmovaps 1888(%rdi), %xmm0
14857 ; AVX-NEXT: vmovaps 1824(%rdi), %xmm1
14858 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
14859 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14860 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
14861 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14862 ; AVX-NEXT: vmovaps 2016(%rdi), %xmm0
14863 ; AVX-NEXT: vmovaps 1952(%rdi), %xmm1
14864 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
14865 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14866 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
14867 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14868 ; AVX-NEXT: vmovaps 2144(%rdi), %xmm0
14869 ; AVX-NEXT: vmovaps 2080(%rdi), %xmm1
14870 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
14871 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14872 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
14873 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14874 ; AVX-NEXT: vmovaps 2272(%rdi), %xmm0
14875 ; AVX-NEXT: vmovaps 2208(%rdi), %xmm1
14876 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
14877 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14878 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
14879 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14880 ; AVX-NEXT: vmovaps 2400(%rdi), %xmm0
14881 ; AVX-NEXT: vmovaps 2336(%rdi), %xmm1
14882 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
14883 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14884 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
14885 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14886 ; AVX-NEXT: vmovaps 2528(%rdi), %xmm0
14887 ; AVX-NEXT: vmovaps 2464(%rdi), %xmm1
14888 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
14889 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14890 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
14891 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14892 ; AVX-NEXT: vmovaps 2656(%rdi), %xmm0
14893 ; AVX-NEXT: vmovaps 2592(%rdi), %xmm1
14894 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
14895 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14896 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
14897 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14898 ; AVX-NEXT: vmovaps 2784(%rdi), %xmm0
14899 ; AVX-NEXT: vmovaps 2720(%rdi), %xmm1
14900 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
14901 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14902 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
14903 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14904 ; AVX-NEXT: vmovaps 2912(%rdi), %xmm0
14905 ; AVX-NEXT: vmovaps 2848(%rdi), %xmm1
14906 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
14907 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14908 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
14909 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14910 ; AVX-NEXT: vmovaps 3040(%rdi), %xmm0
14911 ; AVX-NEXT: vmovaps 2976(%rdi), %xmm1
14912 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
14913 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14914 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
14915 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14916 ; AVX-NEXT: vmovaps 3168(%rdi), %xmm0
14917 ; AVX-NEXT: vmovaps 3104(%rdi), %xmm1
14918 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
14919 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14920 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
14921 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14922 ; AVX-NEXT: vmovaps 3296(%rdi), %xmm0
14923 ; AVX-NEXT: vmovaps 3232(%rdi), %xmm1
14924 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
14925 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14926 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
14927 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14928 ; AVX-NEXT: vmovaps 3424(%rdi), %xmm0
14929 ; AVX-NEXT: vmovaps 3360(%rdi), %xmm1
14930 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
14931 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14932 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
14933 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14934 ; AVX-NEXT: vmovaps 3552(%rdi), %xmm0
14935 ; AVX-NEXT: vmovaps 3488(%rdi), %xmm1
14936 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
14937 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14938 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
14939 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14940 ; AVX-NEXT: vmovaps 3680(%rdi), %xmm0
14941 ; AVX-NEXT: vmovaps 3616(%rdi), %xmm1
14942 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
14943 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14944 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
14945 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14946 ; AVX-NEXT: vmovaps 3808(%rdi), %xmm0
14947 ; AVX-NEXT: vmovaps 3744(%rdi), %xmm1
14948 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
14949 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14950 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
14951 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14952 ; AVX-NEXT: vmovaps 3936(%rdi), %xmm0
14953 ; AVX-NEXT: vmovaps 3872(%rdi), %xmm1
14954 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
14955 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14956 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
14957 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14958 ; AVX-NEXT: vmovaps 4064(%rdi), %xmm0
14959 ; AVX-NEXT: vmovaps 4000(%rdi), %xmm1
14960 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
14961 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14962 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
14963 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14964 ; AVX-NEXT: vmovaps 224(%rdi), %ymm1
14965 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14966 ; AVX-NEXT: vmovaps 160(%rdi), %ymm0
14967 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14968 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
14969 ; AVX-NEXT: vmovaps 112(%rdi), %xmm2
14970 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14971 ; AVX-NEXT: vmovaps 48(%rdi), %xmm1
14972 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14973 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
14974 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14975 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14976 ; AVX-NEXT: vmovaps 480(%rdi), %ymm1
14977 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14978 ; AVX-NEXT: vmovaps 416(%rdi), %ymm0
14979 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14980 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
14981 ; AVX-NEXT: vmovaps 368(%rdi), %xmm2
14982 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14983 ; AVX-NEXT: vmovaps 304(%rdi), %xmm1
14984 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14985 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
14986 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14987 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14988 ; AVX-NEXT: vmovaps 736(%rdi), %ymm1
14989 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14990 ; AVX-NEXT: vmovaps 672(%rdi), %ymm0
14991 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14992 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
14993 ; AVX-NEXT: vmovaps 624(%rdi), %xmm2
14994 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14995 ; AVX-NEXT: vmovaps 560(%rdi), %xmm1
14996 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14997 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
14998 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14999 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15000 ; AVX-NEXT: vmovaps 992(%rdi), %ymm1
15001 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15002 ; AVX-NEXT: vmovaps 928(%rdi), %ymm0
15003 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15004 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
15005 ; AVX-NEXT: vmovaps 880(%rdi), %xmm2
15006 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15007 ; AVX-NEXT: vmovaps 816(%rdi), %xmm1
15008 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15009 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
15010 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
15011 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15012 ; AVX-NEXT: vmovaps 1248(%rdi), %ymm1
15013 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15014 ; AVX-NEXT: vmovaps 1184(%rdi), %ymm0
15015 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15016 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
15017 ; AVX-NEXT: vmovaps 1136(%rdi), %xmm2
15018 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15019 ; AVX-NEXT: vmovaps 1072(%rdi), %xmm1
15020 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15021 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
15022 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
15023 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15024 ; AVX-NEXT: vmovaps 1504(%rdi), %ymm1
15025 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15026 ; AVX-NEXT: vmovaps 1440(%rdi), %ymm0
15027 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15028 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
15029 ; AVX-NEXT: vmovaps 1392(%rdi), %xmm2
15030 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15031 ; AVX-NEXT: vmovaps 1328(%rdi), %xmm1
15032 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15033 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
15034 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
15035 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15036 ; AVX-NEXT: vmovaps 1760(%rdi), %ymm1
15037 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15038 ; AVX-NEXT: vmovaps 1696(%rdi), %ymm0
15039 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15040 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
15041 ; AVX-NEXT: vmovaps 1648(%rdi), %xmm2
15042 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15043 ; AVX-NEXT: vmovaps 1584(%rdi), %xmm1
15044 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15045 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
15046 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
15047 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15048 ; AVX-NEXT: vmovaps 2016(%rdi), %ymm1
15049 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15050 ; AVX-NEXT: vmovaps 1952(%rdi), %ymm0
15051 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15052 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
15053 ; AVX-NEXT: vmovaps 1904(%rdi), %xmm2
15054 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15055 ; AVX-NEXT: vmovaps 1840(%rdi), %xmm1
15056 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15057 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
15058 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
15059 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15060 ; AVX-NEXT: vmovaps 2272(%rdi), %ymm1
15061 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15062 ; AVX-NEXT: vmovaps 2208(%rdi), %ymm0
15063 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15064 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
15065 ; AVX-NEXT: vmovaps 2160(%rdi), %xmm2
15066 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15067 ; AVX-NEXT: vmovaps 2096(%rdi), %xmm1
15068 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15069 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
15070 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
15071 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15072 ; AVX-NEXT: vmovaps 2528(%rdi), %ymm1
15073 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15074 ; AVX-NEXT: vmovaps 2464(%rdi), %ymm0
15075 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15076 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
15077 ; AVX-NEXT: vmovaps 2416(%rdi), %xmm2
15078 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15079 ; AVX-NEXT: vmovaps 2352(%rdi), %xmm1
15080 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15081 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
15082 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
15083 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15084 ; AVX-NEXT: vmovaps 2784(%rdi), %ymm1
15085 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15086 ; AVX-NEXT: vmovaps 2720(%rdi), %ymm0
15087 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15088 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
15089 ; AVX-NEXT: vmovaps 2672(%rdi), %xmm2
15090 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15091 ; AVX-NEXT: vmovaps 2608(%rdi), %xmm1
15092 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15093 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
15094 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
15095 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15096 ; AVX-NEXT: vmovaps 3040(%rdi), %ymm1
15097 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15098 ; AVX-NEXT: vmovaps 2976(%rdi), %ymm0
15099 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15100 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
15101 ; AVX-NEXT: vmovaps 2928(%rdi), %xmm2
15102 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15103 ; AVX-NEXT: vmovaps 2864(%rdi), %xmm1
15104 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15105 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
15106 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
15107 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15108 ; AVX-NEXT: vmovaps 3296(%rdi), %ymm1
15109 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15110 ; AVX-NEXT: vmovaps 3232(%rdi), %ymm0
15111 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15112 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
15113 ; AVX-NEXT: vmovaps 3184(%rdi), %xmm14
15114 ; AVX-NEXT: vmovaps 3120(%rdi), %xmm13
15115 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm13[0],xmm14[0]
15116 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
15117 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15118 ; AVX-NEXT: vmovaps 3552(%rdi), %ymm12
15119 ; AVX-NEXT: vmovaps 3488(%rdi), %ymm11
15120 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm12[0],ymm11[2],ymm12[2]
15121 ; AVX-NEXT: vmovaps 3440(%rdi), %xmm10
15122 ; AVX-NEXT: vmovaps 3376(%rdi), %xmm9
15123 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm9[0],xmm10[0]
15124 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
15125 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15126 ; AVX-NEXT: vmovaps 3808(%rdi), %ymm8
15127 ; AVX-NEXT: vmovaps 3744(%rdi), %ymm7
15128 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm8[0],ymm7[2],ymm8[2]
15129 ; AVX-NEXT: vmovaps 3696(%rdi), %xmm6
15130 ; AVX-NEXT: vmovaps 3632(%rdi), %xmm5
15131 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm6[0]
15132 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
15133 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15134 ; AVX-NEXT: vmovaps 4064(%rdi), %ymm4
15135 ; AVX-NEXT: vmovaps 4000(%rdi), %ymm3
15136 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm4[0],ymm3[2],ymm4[2]
15137 ; AVX-NEXT: vmovaps 3952(%rdi), %xmm2
15138 ; AVX-NEXT: vmovaps 3888(%rdi), %xmm1
15139 ; AVX-NEXT: vmovlhps {{.*#+}} xmm15 = xmm1[0],xmm2[0]
15140 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
15141 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15142 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
15143 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
15144 ; AVX-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
15145 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
15146 ; AVX-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
15147 ; AVX-NEXT: # xmm15 = mem[0,1],xmm15[2,3]
15148 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
15149 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15150 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
15151 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
15152 ; AVX-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
15153 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
15154 ; AVX-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
15155 ; AVX-NEXT: # xmm15 = mem[0,1],xmm15[2,3]
15156 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
15157 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15158 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
15159 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
15160 ; AVX-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
15161 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
15162 ; AVX-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
15163 ; AVX-NEXT: # xmm15 = mem[0,1],xmm15[2,3]
15164 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
15165 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15166 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
15167 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
15168 ; AVX-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
15169 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
15170 ; AVX-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
15171 ; AVX-NEXT: # xmm15 = mem[0,1],xmm15[2,3]
15172 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
15173 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15174 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
15175 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
15176 ; AVX-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
15177 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
15178 ; AVX-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
15179 ; AVX-NEXT: # xmm15 = mem[0,1],xmm15[2,3]
15180 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
15181 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15182 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
15183 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
15184 ; AVX-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
15185 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
15186 ; AVX-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
15187 ; AVX-NEXT: # xmm15 = mem[0,1],xmm15[2,3]
15188 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
15189 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15190 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
15191 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
15192 ; AVX-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
15193 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
15194 ; AVX-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
15195 ; AVX-NEXT: # xmm15 = mem[0,1],xmm15[2,3]
15196 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
15197 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15198 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
15199 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
15200 ; AVX-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
15201 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
15202 ; AVX-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
15203 ; AVX-NEXT: # xmm15 = mem[0,1],xmm15[2,3]
15204 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
15205 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15206 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
15207 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
15208 ; AVX-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
15209 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
15210 ; AVX-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
15211 ; AVX-NEXT: # xmm15 = mem[0,1],xmm15[2,3]
15212 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
15213 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15214 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
15215 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
15216 ; AVX-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
15217 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
15218 ; AVX-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
15219 ; AVX-NEXT: # xmm15 = mem[0,1],xmm15[2,3]
15220 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
15221 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15222 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
15223 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
15224 ; AVX-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
15225 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
15226 ; AVX-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
15227 ; AVX-NEXT: # xmm15 = mem[0,1],xmm15[2,3]
15228 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
15229 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15230 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
15231 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
15232 ; AVX-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
15233 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
15234 ; AVX-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
15235 ; AVX-NEXT: # xmm15 = mem[0,1],xmm15[2,3]
15236 ; AVX-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm0[4,5,6,7]
15237 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
15238 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
15239 ; AVX-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
15240 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm13[1],xmm14[1]
15241 ; AVX-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm0[4,5,6,7]
15242 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm12[1],ymm11[3],ymm12[3]
15243 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm9[1],xmm10[1]
15244 ; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm0[4,5,6,7]
15245 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm8[1],ymm7[3],ymm8[3]
15246 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm6[1]
15247 ; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7]
15248 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm4[1],ymm3[3],ymm4[3]
15249 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
15250 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
15251 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15252 ; AVX-NEXT: vmovaps %xmm1, 464(%rsi)
15253 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15254 ; AVX-NEXT: vmovaps %xmm1, 448(%rsi)
15255 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15256 ; AVX-NEXT: vmovaps %xmm1, 256(%rsi)
15257 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15258 ; AVX-NEXT: vmovaps %xmm1, 384(%rsi)
15259 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15260 ; AVX-NEXT: vmovaps %xmm1, 320(%rsi)
15261 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15262 ; AVX-NEXT: vmovaps %xmm1, 192(%rsi)
15263 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15264 ; AVX-NEXT: vmovaps %xmm1, 128(%rsi)
15265 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15266 ; AVX-NEXT: vmovaps %xmm1, 64(%rsi)
15267 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15268 ; AVX-NEXT: vmovaps %xmm1, (%rsi)
15269 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15270 ; AVX-NEXT: vmovaps %xmm1, 272(%rsi)
15271 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15272 ; AVX-NEXT: vmovaps %xmm1, 400(%rsi)
15273 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15274 ; AVX-NEXT: vmovaps %xmm1, 336(%rsi)
15275 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15276 ; AVX-NEXT: vmovaps %xmm1, 208(%rsi)
15277 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15278 ; AVX-NEXT: vmovaps %xmm1, 144(%rsi)
15279 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15280 ; AVX-NEXT: vmovaps %xmm1, 80(%rsi)
15281 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15282 ; AVX-NEXT: vmovaps %xmm1, 16(%rsi)
15283 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15284 ; AVX-NEXT: vmovaps %xmm1, 496(%rsi)
15285 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15286 ; AVX-NEXT: vmovaps %xmm1, 480(%rsi)
15287 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15288 ; AVX-NEXT: vmovaps %xmm1, 32(%rsi)
15289 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15290 ; AVX-NEXT: vmovaps %xmm1, 416(%rsi)
15291 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15292 ; AVX-NEXT: vmovaps %xmm1, 352(%rsi)
15293 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15294 ; AVX-NEXT: vmovaps %xmm1, 288(%rsi)
15295 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15296 ; AVX-NEXT: vmovaps %xmm1, 224(%rsi)
15297 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15298 ; AVX-NEXT: vmovaps %xmm1, 160(%rsi)
15299 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15300 ; AVX-NEXT: vmovaps %xmm1, 96(%rsi)
15301 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15302 ; AVX-NEXT: vmovaps %xmm1, 48(%rsi)
15303 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15304 ; AVX-NEXT: vmovaps %xmm1, 432(%rsi)
15305 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15306 ; AVX-NEXT: vmovaps %xmm1, 368(%rsi)
15307 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15308 ; AVX-NEXT: vmovaps %xmm1, 304(%rsi)
15309 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15310 ; AVX-NEXT: vmovaps %xmm1, 240(%rsi)
15311 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15312 ; AVX-NEXT: vmovaps %xmm1, 176(%rsi)
15313 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15314 ; AVX-NEXT: vmovaps %xmm1, 112(%rsi)
15315 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15316 ; AVX-NEXT: vmovaps %xmm1, 128(%rdx)
15317 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15318 ; AVX-NEXT: vmovaps %xmm1, 144(%rdx)
15319 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15320 ; AVX-NEXT: vmovaps %xmm1, 256(%rdx)
15321 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15322 ; AVX-NEXT: vmovaps %xmm1, 272(%rdx)
15323 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15324 ; AVX-NEXT: vmovaps %xmm1, 64(%rdx)
15325 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15326 ; AVX-NEXT: vmovaps %xmm1, 80(%rdx)
15327 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15328 ; AVX-NEXT: vmovaps %xmm1, (%rdx)
15329 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15330 ; AVX-NEXT: vmovaps %xmm1, 16(%rdx)
15331 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15332 ; AVX-NEXT: vmovaps %xmm1, 192(%rdx)
15333 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15334 ; AVX-NEXT: vmovaps %xmm1, 208(%rdx)
15335 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15336 ; AVX-NEXT: vmovaps %xmm1, 320(%rdx)
15337 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15338 ; AVX-NEXT: vmovaps %xmm1, 336(%rdx)
15339 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15340 ; AVX-NEXT: vmovaps %xmm1, 384(%rdx)
15341 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15342 ; AVX-NEXT: vmovaps %xmm1, 400(%rdx)
15343 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15344 ; AVX-NEXT: vmovaps %xmm1, 448(%rdx)
15345 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15346 ; AVX-NEXT: vmovaps %xmm1, 464(%rdx)
15347 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15348 ; AVX-NEXT: vmovaps %xmm1, 96(%rdx)
15349 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15350 ; AVX-NEXT: vmovaps %xmm1, 112(%rdx)
15351 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15352 ; AVX-NEXT: vmovaps %xmm1, 32(%rdx)
15353 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15354 ; AVX-NEXT: vmovaps %xmm1, 48(%rdx)
15355 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15356 ; AVX-NEXT: vmovaps %xmm1, 160(%rdx)
15357 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15358 ; AVX-NEXT: vmovaps %xmm1, 176(%rdx)
15359 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15360 ; AVX-NEXT: vmovaps %xmm1, 224(%rdx)
15361 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15362 ; AVX-NEXT: vmovaps %xmm1, 240(%rdx)
15363 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15364 ; AVX-NEXT: vmovaps %xmm1, 288(%rdx)
15365 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15366 ; AVX-NEXT: vmovaps %xmm1, 304(%rdx)
15367 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15368 ; AVX-NEXT: vmovaps %xmm1, 352(%rdx)
15369 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15370 ; AVX-NEXT: vmovaps %xmm1, 368(%rdx)
15371 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15372 ; AVX-NEXT: vmovaps %xmm1, 416(%rdx)
15373 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15374 ; AVX-NEXT: vmovaps %xmm1, 432(%rdx)
15375 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15376 ; AVX-NEXT: vmovaps %xmm1, 480(%rdx)
15377 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15378 ; AVX-NEXT: vmovaps %xmm1, 496(%rdx)
15379 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15380 ; AVX-NEXT: vmovaps %ymm1, 448(%rcx)
15381 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15382 ; AVX-NEXT: vmovaps %ymm1, 384(%rcx)
15383 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15384 ; AVX-NEXT: vmovaps %ymm1, 320(%rcx)
15385 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15386 ; AVX-NEXT: vmovaps %ymm1, 256(%rcx)
15387 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15388 ; AVX-NEXT: vmovaps %ymm1, 192(%rcx)
15389 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15390 ; AVX-NEXT: vmovaps %ymm1, 128(%rcx)
15391 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15392 ; AVX-NEXT: vmovaps %ymm1, 64(%rcx)
15393 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15394 ; AVX-NEXT: vmovaps %ymm1, (%rcx)
15395 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15396 ; AVX-NEXT: vmovaps %ymm1, 480(%rcx)
15397 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15398 ; AVX-NEXT: vmovaps %ymm1, 416(%rcx)
15399 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15400 ; AVX-NEXT: vmovaps %ymm1, 352(%rcx)
15401 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15402 ; AVX-NEXT: vmovaps %ymm1, 288(%rcx)
15403 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15404 ; AVX-NEXT: vmovaps %ymm1, 224(%rcx)
15405 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15406 ; AVX-NEXT: vmovaps %ymm1, 160(%rcx)
15407 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15408 ; AVX-NEXT: vmovaps %ymm1, 96(%rcx)
15409 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15410 ; AVX-NEXT: vmovaps %ymm1, 32(%rcx)
15411 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15412 ; AVX-NEXT: vmovaps %ymm1, 480(%r8)
15413 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15414 ; AVX-NEXT: vmovaps %ymm1, 448(%r8)
15415 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15416 ; AVX-NEXT: vmovaps %ymm1, 416(%r8)
15417 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15418 ; AVX-NEXT: vmovaps %ymm1, 384(%r8)
15419 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15420 ; AVX-NEXT: vmovaps %ymm1, 352(%r8)
15421 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15422 ; AVX-NEXT: vmovaps %ymm1, 320(%r8)
15423 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15424 ; AVX-NEXT: vmovaps %ymm1, 288(%r8)
15425 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15426 ; AVX-NEXT: vmovaps %ymm1, 256(%r8)
15427 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15428 ; AVX-NEXT: vmovaps %ymm1, 224(%r8)
15429 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15430 ; AVX-NEXT: vmovaps %ymm1, 192(%r8)
15431 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15432 ; AVX-NEXT: vmovaps %ymm1, 160(%r8)
15433 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15434 ; AVX-NEXT: vmovaps %ymm1, 128(%r8)
15435 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15436 ; AVX-NEXT: vmovaps %ymm1, 96(%r8)
15437 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15438 ; AVX-NEXT: vmovaps %ymm1, 64(%r8)
15439 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15440 ; AVX-NEXT: vmovaps %ymm1, 32(%r8)
15441 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15442 ; AVX-NEXT: vmovaps %ymm1, (%r8)
15443 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15444 ; AVX-NEXT: vmovaps %xmm1, 496(%r9)
15445 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15446 ; AVX-NEXT: vmovaps %xmm1, 480(%r9)
15447 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15448 ; AVX-NEXT: vmovaps %xmm1, 464(%r9)
15449 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15450 ; AVX-NEXT: vmovaps %xmm1, 448(%r9)
15451 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15452 ; AVX-NEXT: vmovaps %xmm1, 432(%r9)
15453 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15454 ; AVX-NEXT: vmovaps %xmm1, 416(%r9)
15455 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15456 ; AVX-NEXT: vmovaps %xmm1, 400(%r9)
15457 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15458 ; AVX-NEXT: vmovaps %xmm1, 384(%r9)
15459 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15460 ; AVX-NEXT: vmovaps %xmm1, 368(%r9)
15461 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15462 ; AVX-NEXT: vmovaps %xmm1, 352(%r9)
15463 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15464 ; AVX-NEXT: vmovaps %xmm1, 336(%r9)
15465 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15466 ; AVX-NEXT: vmovaps %xmm1, 320(%r9)
15467 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15468 ; AVX-NEXT: vmovaps %xmm1, 304(%r9)
15469 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15470 ; AVX-NEXT: vmovaps %xmm1, 288(%r9)
15471 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15472 ; AVX-NEXT: vmovaps %xmm1, 272(%r9)
15473 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15474 ; AVX-NEXT: vmovaps %xmm1, 256(%r9)
15475 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15476 ; AVX-NEXT: vmovaps %xmm1, 240(%r9)
15477 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15478 ; AVX-NEXT: vmovaps %xmm1, 224(%r9)
15479 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15480 ; AVX-NEXT: vmovaps %xmm1, 208(%r9)
15481 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15482 ; AVX-NEXT: vmovaps %xmm1, 192(%r9)
15483 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15484 ; AVX-NEXT: vmovaps %xmm1, 176(%r9)
15485 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15486 ; AVX-NEXT: vmovaps %xmm1, 160(%r9)
15487 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15488 ; AVX-NEXT: vmovaps %xmm1, 144(%r9)
15489 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15490 ; AVX-NEXT: vmovaps %xmm1, 128(%r9)
15491 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15492 ; AVX-NEXT: vmovaps %xmm1, 112(%r9)
15493 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15494 ; AVX-NEXT: vmovaps %xmm1, 96(%r9)
15495 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15496 ; AVX-NEXT: vmovaps %xmm1, 80(%r9)
15497 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15498 ; AVX-NEXT: vmovaps %xmm1, 64(%r9)
15499 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15500 ; AVX-NEXT: vmovaps %xmm1, 48(%r9)
15501 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15502 ; AVX-NEXT: vmovaps %xmm1, 32(%r9)
15503 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15504 ; AVX-NEXT: vmovaps %xmm1, 16(%r9)
15505 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15506 ; AVX-NEXT: vmovaps %xmm1, (%r9)
15507 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
15508 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15509 ; AVX-NEXT: vmovaps %xmm1, 496(%rax)
15510 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15511 ; AVX-NEXT: vmovaps %xmm1, 480(%rax)
15512 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15513 ; AVX-NEXT: vmovaps %xmm1, 464(%rax)
15514 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15515 ; AVX-NEXT: vmovaps %xmm1, 448(%rax)
15516 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15517 ; AVX-NEXT: vmovaps %xmm1, 432(%rax)
15518 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15519 ; AVX-NEXT: vmovaps %xmm1, 416(%rax)
15520 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15521 ; AVX-NEXT: vmovaps %xmm1, 400(%rax)
15522 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15523 ; AVX-NEXT: vmovaps %xmm1, 384(%rax)
15524 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15525 ; AVX-NEXT: vmovaps %xmm1, 368(%rax)
15526 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15527 ; AVX-NEXT: vmovaps %xmm1, 352(%rax)
15528 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15529 ; AVX-NEXT: vmovaps %xmm1, 336(%rax)
15530 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15531 ; AVX-NEXT: vmovaps %xmm1, 320(%rax)
15532 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15533 ; AVX-NEXT: vmovaps %xmm1, 304(%rax)
15534 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15535 ; AVX-NEXT: vmovaps %xmm1, 288(%rax)
15536 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15537 ; AVX-NEXT: vmovaps %xmm1, 272(%rax)
15538 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15539 ; AVX-NEXT: vmovaps %xmm1, 256(%rax)
15540 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15541 ; AVX-NEXT: vmovaps %xmm1, 240(%rax)
15542 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15543 ; AVX-NEXT: vmovaps %xmm1, 224(%rax)
15544 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15545 ; AVX-NEXT: vmovaps %xmm1, 208(%rax)
15546 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15547 ; AVX-NEXT: vmovaps %xmm1, 192(%rax)
15548 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15549 ; AVX-NEXT: vmovaps %xmm1, 176(%rax)
15550 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15551 ; AVX-NEXT: vmovaps %xmm1, 160(%rax)
15552 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15553 ; AVX-NEXT: vmovaps %xmm1, 144(%rax)
15554 ; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
15555 ; AVX-NEXT: vmovaps %xmm1, 128(%rax)
15556 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15557 ; AVX-NEXT: vmovaps %xmm1, 112(%rax)
15558 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15559 ; AVX-NEXT: vmovaps %xmm1, 96(%rax)
15560 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15561 ; AVX-NEXT: vmovaps %xmm1, 80(%rax)
15562 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15563 ; AVX-NEXT: vmovaps %xmm1, 64(%rax)
15564 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15565 ; AVX-NEXT: vmovaps %xmm1, 48(%rax)
15566 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15567 ; AVX-NEXT: vmovaps %xmm1, 32(%rax)
15568 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15569 ; AVX-NEXT: vmovaps %xmm1, 16(%rax)
15570 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15571 ; AVX-NEXT: vmovaps %xmm1, (%rax)
15572 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
15573 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15574 ; AVX-NEXT: vmovaps %ymm1, 480(%rax)
15575 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15576 ; AVX-NEXT: vmovaps %ymm1, 448(%rax)
15577 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15578 ; AVX-NEXT: vmovaps %ymm1, 416(%rax)
15579 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15580 ; AVX-NEXT: vmovaps %ymm1, 384(%rax)
15581 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15582 ; AVX-NEXT: vmovaps %ymm1, 352(%rax)
15583 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15584 ; AVX-NEXT: vmovaps %ymm1, 320(%rax)
15585 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15586 ; AVX-NEXT: vmovaps %ymm1, 288(%rax)
15587 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15588 ; AVX-NEXT: vmovaps %ymm1, 256(%rax)
15589 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15590 ; AVX-NEXT: vmovaps %ymm1, 224(%rax)
15591 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15592 ; AVX-NEXT: vmovaps %ymm1, 192(%rax)
15593 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15594 ; AVX-NEXT: vmovaps %ymm1, 160(%rax)
15595 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15596 ; AVX-NEXT: vmovaps %ymm1, 128(%rax)
15597 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15598 ; AVX-NEXT: vmovaps %ymm1, 96(%rax)
15599 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15600 ; AVX-NEXT: vmovaps %ymm1, 64(%rax)
15601 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15602 ; AVX-NEXT: vmovaps %ymm1, 32(%rax)
15603 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15604 ; AVX-NEXT: vmovaps %ymm1, (%rax)
15605 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
15606 ; AVX-NEXT: vmovaps %ymm0, 480(%rax)
15607 ; AVX-NEXT: vmovaps %ymm5, 448(%rax)
15608 ; AVX-NEXT: vmovaps %ymm9, 416(%rax)
15609 ; AVX-NEXT: vmovaps %ymm13, 384(%rax)
15610 ; AVX-NEXT: vmovaps %ymm15, 352(%rax)
15611 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
15612 ; AVX-NEXT: vmovaps %ymm0, 320(%rax)
15613 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
15614 ; AVX-NEXT: vmovaps %ymm0, 288(%rax)
15615 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
15616 ; AVX-NEXT: vmovaps %ymm0, 256(%rax)
15617 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
15618 ; AVX-NEXT: vmovaps %ymm0, 224(%rax)
15619 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
15620 ; AVX-NEXT: vmovaps %ymm0, 192(%rax)
15621 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
15622 ; AVX-NEXT: vmovaps %ymm0, 160(%rax)
15623 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
15624 ; AVX-NEXT: vmovaps %ymm0, 128(%rax)
15625 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
15626 ; AVX-NEXT: vmovaps %ymm0, 96(%rax)
15627 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
15628 ; AVX-NEXT: vmovaps %ymm0, 64(%rax)
15629 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
15630 ; AVX-NEXT: vmovaps %ymm0, 32(%rax)
15631 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
15632 ; AVX-NEXT: vmovaps %ymm0, (%rax)
15633 ; AVX-NEXT: addq $5016, %rsp # imm = 0x1398
15634 ; AVX-NEXT: vzeroupper
15637 ; AVX2-LABEL: load_i64_stride8_vf64:
15639 ; AVX2-NEXT: subq $5064, %rsp # imm = 0x13C8
15640 ; AVX2-NEXT: vmovaps 320(%rdi), %xmm0
15641 ; AVX2-NEXT: vinsertf128 $1, 448(%rdi), %ymm0, %ymm0
15642 ; AVX2-NEXT: vmovaps 256(%rdi), %xmm1
15643 ; AVX2-NEXT: vinsertf128 $1, 384(%rdi), %ymm1, %ymm1
15644 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
15645 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15646 ; AVX2-NEXT: vmovaps 832(%rdi), %xmm2
15647 ; AVX2-NEXT: vinsertf128 $1, 960(%rdi), %ymm2, %ymm2
15648 ; AVX2-NEXT: vmovaps 768(%rdi), %xmm3
15649 ; AVX2-NEXT: vinsertf128 $1, 896(%rdi), %ymm3, %ymm3
15650 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
15651 ; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15652 ; AVX2-NEXT: vmovaps 1344(%rdi), %xmm4
15653 ; AVX2-NEXT: vinsertf128 $1, 1472(%rdi), %ymm4, %ymm4
15654 ; AVX2-NEXT: vmovaps 1280(%rdi), %xmm5
15655 ; AVX2-NEXT: vinsertf128 $1, 1408(%rdi), %ymm5, %ymm5
15656 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm5[0],ymm4[0],ymm5[2],ymm4[2]
15657 ; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15658 ; AVX2-NEXT: vmovaps 1856(%rdi), %xmm6
15659 ; AVX2-NEXT: vinsertf128 $1, 1984(%rdi), %ymm6, %ymm6
15660 ; AVX2-NEXT: vmovaps 1792(%rdi), %xmm7
15661 ; AVX2-NEXT: vinsertf128 $1, 1920(%rdi), %ymm7, %ymm7
15662 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm7[0],ymm6[0],ymm7[2],ymm6[2]
15663 ; AVX2-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15664 ; AVX2-NEXT: vmovaps 2368(%rdi), %xmm8
15665 ; AVX2-NEXT: vinsertf128 $1, 2496(%rdi), %ymm8, %ymm8
15666 ; AVX2-NEXT: vmovaps 2304(%rdi), %xmm9
15667 ; AVX2-NEXT: vinsertf128 $1, 2432(%rdi), %ymm9, %ymm9
15668 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2]
15669 ; AVX2-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15670 ; AVX2-NEXT: vmovaps 2880(%rdi), %xmm10
15671 ; AVX2-NEXT: vinsertf128 $1, 3008(%rdi), %ymm10, %ymm10
15672 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
15673 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15674 ; AVX2-NEXT: vmovaps 2816(%rdi), %xmm0
15675 ; AVX2-NEXT: vinsertf128 $1, 2944(%rdi), %ymm0, %ymm0
15676 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm2[1],ymm3[3],ymm2[3]
15677 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15678 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm10[0],ymm0[2],ymm10[2]
15679 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15680 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm5[1],ymm4[1],ymm5[3],ymm4[3]
15681 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15682 ; AVX2-NEXT: vmovaps 3392(%rdi), %xmm1
15683 ; AVX2-NEXT: vinsertf128 $1, 3520(%rdi), %ymm1, %ymm1
15684 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm7[1],ymm6[1],ymm7[3],ymm6[3]
15685 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15686 ; AVX2-NEXT: vmovaps 3328(%rdi), %xmm2
15687 ; AVX2-NEXT: vinsertf128 $1, 3456(%rdi), %ymm2, %ymm2
15688 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm9[1],ymm8[1],ymm9[3],ymm8[3]
15689 ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15690 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
15691 ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15692 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm10[1],ymm0[3],ymm10[3]
15693 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15694 ; AVX2-NEXT: vmovaps 3904(%rdi), %xmm0
15695 ; AVX2-NEXT: vinsertf128 $1, 4032(%rdi), %ymm0, %ymm0
15696 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3]
15697 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15698 ; AVX2-NEXT: vmovaps 3840(%rdi), %xmm1
15699 ; AVX2-NEXT: vinsertf128 $1, 3968(%rdi), %ymm1, %ymm1
15700 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
15701 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15702 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
15703 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15704 ; AVX2-NEXT: vmovaps 64(%rdi), %xmm0
15705 ; AVX2-NEXT: vinsertf128 $1, 192(%rdi), %ymm0, %ymm0
15706 ; AVX2-NEXT: vmovaps (%rdi), %xmm1
15707 ; AVX2-NEXT: vinsertf128 $1, 128(%rdi), %ymm1, %ymm1
15708 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
15709 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15710 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
15711 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15712 ; AVX2-NEXT: vmovaps 576(%rdi), %xmm0
15713 ; AVX2-NEXT: vinsertf128 $1, 704(%rdi), %ymm0, %ymm0
15714 ; AVX2-NEXT: vmovaps 512(%rdi), %xmm1
15715 ; AVX2-NEXT: vinsertf128 $1, 640(%rdi), %ymm1, %ymm1
15716 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
15717 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15718 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
15719 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15720 ; AVX2-NEXT: vmovaps 1088(%rdi), %xmm0
15721 ; AVX2-NEXT: vinsertf128 $1, 1216(%rdi), %ymm0, %ymm0
15722 ; AVX2-NEXT: vmovaps 1024(%rdi), %xmm1
15723 ; AVX2-NEXT: vinsertf128 $1, 1152(%rdi), %ymm1, %ymm1
15724 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
15725 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15726 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
15727 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15728 ; AVX2-NEXT: vmovaps 1600(%rdi), %xmm0
15729 ; AVX2-NEXT: vinsertf128 $1, 1728(%rdi), %ymm0, %ymm0
15730 ; AVX2-NEXT: vmovaps 1536(%rdi), %xmm1
15731 ; AVX2-NEXT: vinsertf128 $1, 1664(%rdi), %ymm1, %ymm1
15732 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
15733 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15734 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
15735 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15736 ; AVX2-NEXT: vmovaps 2112(%rdi), %xmm0
15737 ; AVX2-NEXT: vinsertf128 $1, 2240(%rdi), %ymm0, %ymm0
15738 ; AVX2-NEXT: vmovaps 2048(%rdi), %xmm1
15739 ; AVX2-NEXT: vinsertf128 $1, 2176(%rdi), %ymm1, %ymm1
15740 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
15741 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15742 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
15743 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15744 ; AVX2-NEXT: vmovaps 2624(%rdi), %xmm0
15745 ; AVX2-NEXT: vinsertf128 $1, 2752(%rdi), %ymm0, %ymm0
15746 ; AVX2-NEXT: vmovaps 2560(%rdi), %xmm1
15747 ; AVX2-NEXT: vinsertf128 $1, 2688(%rdi), %ymm1, %ymm1
15748 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
15749 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15750 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
15751 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15752 ; AVX2-NEXT: vmovaps 3136(%rdi), %xmm0
15753 ; AVX2-NEXT: vinsertf128 $1, 3264(%rdi), %ymm0, %ymm0
15754 ; AVX2-NEXT: vmovaps 3072(%rdi), %xmm1
15755 ; AVX2-NEXT: vinsertf128 $1, 3200(%rdi), %ymm1, %ymm1
15756 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
15757 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15758 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
15759 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15760 ; AVX2-NEXT: vmovaps 3648(%rdi), %xmm0
15761 ; AVX2-NEXT: vinsertf128 $1, 3776(%rdi), %ymm0, %ymm0
15762 ; AVX2-NEXT: vmovaps 3584(%rdi), %xmm1
15763 ; AVX2-NEXT: vinsertf128 $1, 3712(%rdi), %ymm1, %ymm1
15764 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
15765 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15766 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
15767 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15768 ; AVX2-NEXT: vmovaps 320(%rdi), %ymm2
15769 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15770 ; AVX2-NEXT: vmovaps 256(%rdi), %ymm3
15771 ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15772 ; AVX2-NEXT: vmovaps 448(%rdi), %ymm1
15773 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15774 ; AVX2-NEXT: vmovaps 384(%rdi), %ymm0
15775 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15776 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
15777 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
15778 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
15779 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15780 ; AVX2-NEXT: vmovaps 832(%rdi), %ymm2
15781 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15782 ; AVX2-NEXT: vmovaps 768(%rdi), %ymm3
15783 ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15784 ; AVX2-NEXT: vmovaps 960(%rdi), %ymm1
15785 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15786 ; AVX2-NEXT: vmovaps 896(%rdi), %ymm0
15787 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15788 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
15789 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
15790 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
15791 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15792 ; AVX2-NEXT: vmovaps 1344(%rdi), %ymm2
15793 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15794 ; AVX2-NEXT: vmovaps 1280(%rdi), %ymm3
15795 ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15796 ; AVX2-NEXT: vmovaps 1472(%rdi), %ymm1
15797 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15798 ; AVX2-NEXT: vmovaps 1408(%rdi), %ymm0
15799 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15800 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
15801 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
15802 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
15803 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15804 ; AVX2-NEXT: vmovaps 1856(%rdi), %ymm2
15805 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15806 ; AVX2-NEXT: vmovaps 1792(%rdi), %ymm3
15807 ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15808 ; AVX2-NEXT: vmovaps 1984(%rdi), %ymm1
15809 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15810 ; AVX2-NEXT: vmovaps 1920(%rdi), %ymm0
15811 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15812 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
15813 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
15814 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
15815 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15816 ; AVX2-NEXT: vmovaps 2368(%rdi), %ymm2
15817 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15818 ; AVX2-NEXT: vmovaps 2304(%rdi), %ymm3
15819 ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15820 ; AVX2-NEXT: vmovaps 2496(%rdi), %ymm1
15821 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15822 ; AVX2-NEXT: vmovaps 2432(%rdi), %ymm0
15823 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15824 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
15825 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
15826 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
15827 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15828 ; AVX2-NEXT: vmovaps 2880(%rdi), %ymm2
15829 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15830 ; AVX2-NEXT: vmovaps 2816(%rdi), %ymm3
15831 ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15832 ; AVX2-NEXT: vmovaps 3008(%rdi), %ymm1
15833 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15834 ; AVX2-NEXT: vmovaps 2944(%rdi), %ymm0
15835 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15836 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
15837 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
15838 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
15839 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15840 ; AVX2-NEXT: vmovaps 3392(%rdi), %ymm2
15841 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15842 ; AVX2-NEXT: vmovaps 3328(%rdi), %ymm3
15843 ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15844 ; AVX2-NEXT: vmovaps 3520(%rdi), %ymm1
15845 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15846 ; AVX2-NEXT: vmovaps 3456(%rdi), %ymm0
15847 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15848 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
15849 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
15850 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
15851 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15852 ; AVX2-NEXT: vmovaps 3904(%rdi), %ymm2
15853 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15854 ; AVX2-NEXT: vmovaps 3840(%rdi), %ymm3
15855 ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15856 ; AVX2-NEXT: vmovaps 4032(%rdi), %ymm1
15857 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15858 ; AVX2-NEXT: vmovaps 3968(%rdi), %ymm0
15859 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15860 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
15861 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
15862 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
15863 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15864 ; AVX2-NEXT: vmovaps 64(%rdi), %ymm2
15865 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15866 ; AVX2-NEXT: vmovaps (%rdi), %ymm3
15867 ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15868 ; AVX2-NEXT: vmovaps 192(%rdi), %ymm1
15869 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15870 ; AVX2-NEXT: vmovaps 128(%rdi), %ymm0
15871 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15872 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
15873 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
15874 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
15875 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15876 ; AVX2-NEXT: vmovaps 576(%rdi), %ymm2
15877 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15878 ; AVX2-NEXT: vmovaps 512(%rdi), %ymm3
15879 ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15880 ; AVX2-NEXT: vmovaps 704(%rdi), %ymm1
15881 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15882 ; AVX2-NEXT: vmovaps 640(%rdi), %ymm0
15883 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15884 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
15885 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
15886 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
15887 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15888 ; AVX2-NEXT: vmovaps 1088(%rdi), %ymm2
15889 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15890 ; AVX2-NEXT: vmovaps 1024(%rdi), %ymm3
15891 ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15892 ; AVX2-NEXT: vmovaps 1216(%rdi), %ymm0
15893 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15894 ; AVX2-NEXT: vmovaps 1152(%rdi), %ymm15
15895 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm0[0],ymm15[2],ymm0[2]
15896 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
15897 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
15898 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15899 ; AVX2-NEXT: vmovaps 1600(%rdi), %ymm2
15900 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15901 ; AVX2-NEXT: vmovaps 1536(%rdi), %ymm1
15902 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15903 ; AVX2-NEXT: vmovaps 1728(%rdi), %ymm14
15904 ; AVX2-NEXT: vmovaps 1664(%rdi), %ymm11
15905 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm14[0],ymm11[2],ymm14[2]
15906 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
15907 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
15908 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15909 ; AVX2-NEXT: vmovaps 2112(%rdi), %ymm2
15910 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15911 ; AVX2-NEXT: vmovaps 2048(%rdi), %ymm1
15912 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15913 ; AVX2-NEXT: vmovaps 2240(%rdi), %ymm10
15914 ; AVX2-NEXT: vmovaps 2176(%rdi), %ymm8
15915 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm8[0],ymm10[0],ymm8[2],ymm10[2]
15916 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
15917 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
15918 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15919 ; AVX2-NEXT: vmovaps 2624(%rdi), %ymm2
15920 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15921 ; AVX2-NEXT: vmovaps 2560(%rdi), %ymm1
15922 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15923 ; AVX2-NEXT: vmovaps 2752(%rdi), %ymm7
15924 ; AVX2-NEXT: vmovaps 2688(%rdi), %ymm5
15925 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm7[0],ymm5[2],ymm7[2]
15926 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
15927 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
15928 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15929 ; AVX2-NEXT: vmovaps 3136(%rdi), %ymm1
15930 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15931 ; AVX2-NEXT: vmovaps 3072(%rdi), %ymm9
15932 ; AVX2-NEXT: vmovaps 3264(%rdi), %ymm4
15933 ; AVX2-NEXT: vmovaps 3200(%rdi), %ymm3
15934 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm4[0],ymm3[2],ymm4[2]
15935 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm9[0],ymm1[0],ymm9[2],ymm1[2]
15936 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3]
15937 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15938 ; AVX2-NEXT: vmovaps 3648(%rdi), %ymm12
15939 ; AVX2-NEXT: vmovaps 3584(%rdi), %ymm6
15940 ; AVX2-NEXT: vmovaps 3776(%rdi), %ymm2
15941 ; AVX2-NEXT: vmovaps 3712(%rdi), %ymm1
15942 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
15943 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm6[0],ymm12[0],ymm6[2],ymm12[2]
15944 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3]
15945 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15946 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
15947 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
15948 ; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
15949 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
15950 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
15951 ; AVX2-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
15952 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3]
15953 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15954 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
15955 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
15956 ; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
15957 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
15958 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
15959 ; AVX2-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
15960 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3]
15961 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15962 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
15963 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
15964 ; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
15965 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
15966 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
15967 ; AVX2-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
15968 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3]
15969 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15970 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
15971 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
15972 ; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
15973 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
15974 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
15975 ; AVX2-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
15976 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3]
15977 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15978 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload
15979 ; AVX2-NEXT: # ymm0 = ymm15[1],mem[1],ymm15[3],mem[3]
15980 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
15981 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
15982 ; AVX2-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
15983 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3]
15984 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15985 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
15986 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
15987 ; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
15988 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
15989 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
15990 ; AVX2-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
15991 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3]
15992 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15993 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm14[1],ymm11[3],ymm14[3]
15994 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
15995 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
15996 ; AVX2-NEXT: # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3]
15997 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3]
15998 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15999 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
16000 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
16001 ; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
16002 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
16003 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
16004 ; AVX2-NEXT: # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3]
16005 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3]
16006 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16007 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm8[1],ymm10[1],ymm8[3],ymm10[3]
16008 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
16009 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
16010 ; AVX2-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3]
16011 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm8[2,3],ymm0[2,3]
16012 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16013 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
16014 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
16015 ; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
16016 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
16017 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
16018 ; AVX2-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3]
16019 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm8[2,3],ymm0[2,3]
16020 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16021 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm7[1],ymm5[3],ymm7[3]
16022 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
16023 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
16024 ; AVX2-NEXT: # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3]
16025 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm5[2,3],ymm0[2,3]
16026 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16027 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
16028 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
16029 ; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
16030 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
16031 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
16032 ; AVX2-NEXT: # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3]
16033 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm5[2,3],ymm0[2,3]
16034 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16035 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm4[1],ymm3[3],ymm4[3]
16036 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload
16037 ; AVX2-NEXT: # ymm3 = ymm9[1],mem[1],ymm9[3],mem[3]
16038 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
16039 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16040 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
16041 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
16042 ; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
16043 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
16044 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
16045 ; AVX2-NEXT: # ymm3 = ymm3[1],mem[1],ymm3[3],mem[3]
16046 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
16047 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16048 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm2[1],ymm1[3],ymm2[3]
16049 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm6[1],ymm12[1],ymm6[3],ymm12[3]
16050 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
16051 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16052 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
16053 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
16054 ; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
16055 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16056 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
16057 ; AVX2-NEXT: # ymm1 = ymm1[1],mem[1],ymm1[3],mem[3]
16058 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
16059 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16060 ; AVX2-NEXT: vmovaps 96(%rdi), %xmm0
16061 ; AVX2-NEXT: vinsertf128 $1, 224(%rdi), %ymm0, %ymm0
16062 ; AVX2-NEXT: vmovaps 32(%rdi), %xmm1
16063 ; AVX2-NEXT: vinsertf128 $1, 160(%rdi), %ymm1, %ymm1
16064 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
16065 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16066 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
16067 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16068 ; AVX2-NEXT: vmovaps 352(%rdi), %xmm0
16069 ; AVX2-NEXT: vinsertf128 $1, 480(%rdi), %ymm0, %ymm0
16070 ; AVX2-NEXT: vmovaps 288(%rdi), %xmm1
16071 ; AVX2-NEXT: vinsertf128 $1, 416(%rdi), %ymm1, %ymm1
16072 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
16073 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16074 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
16075 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16076 ; AVX2-NEXT: vmovaps 608(%rdi), %xmm0
16077 ; AVX2-NEXT: vinsertf128 $1, 736(%rdi), %ymm0, %ymm0
16078 ; AVX2-NEXT: vmovaps 544(%rdi), %xmm1
16079 ; AVX2-NEXT: vinsertf128 $1, 672(%rdi), %ymm1, %ymm1
16080 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
16081 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16082 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
16083 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16084 ; AVX2-NEXT: vmovaps 864(%rdi), %xmm0
16085 ; AVX2-NEXT: vinsertf128 $1, 992(%rdi), %ymm0, %ymm0
16086 ; AVX2-NEXT: vmovaps 800(%rdi), %xmm1
16087 ; AVX2-NEXT: vinsertf128 $1, 928(%rdi), %ymm1, %ymm1
16088 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
16089 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16090 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
16091 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16092 ; AVX2-NEXT: vmovaps 1120(%rdi), %xmm0
16093 ; AVX2-NEXT: vinsertf128 $1, 1248(%rdi), %ymm0, %ymm0
16094 ; AVX2-NEXT: vmovaps 1056(%rdi), %xmm1
16095 ; AVX2-NEXT: vinsertf128 $1, 1184(%rdi), %ymm1, %ymm1
16096 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
16097 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16098 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
16099 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16100 ; AVX2-NEXT: vmovaps 1376(%rdi), %xmm0
16101 ; AVX2-NEXT: vinsertf128 $1, 1504(%rdi), %ymm0, %ymm0
16102 ; AVX2-NEXT: vmovaps 1312(%rdi), %xmm1
16103 ; AVX2-NEXT: vinsertf128 $1, 1440(%rdi), %ymm1, %ymm1
16104 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
16105 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16106 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
16107 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16108 ; AVX2-NEXT: vmovaps 1632(%rdi), %xmm0
16109 ; AVX2-NEXT: vinsertf128 $1, 1760(%rdi), %ymm0, %ymm0
16110 ; AVX2-NEXT: vmovaps 1568(%rdi), %xmm1
16111 ; AVX2-NEXT: vinsertf128 $1, 1696(%rdi), %ymm1, %ymm1
16112 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
16113 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16114 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
16115 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16116 ; AVX2-NEXT: vmovaps 1888(%rdi), %xmm0
16117 ; AVX2-NEXT: vinsertf128 $1, 2016(%rdi), %ymm0, %ymm0
16118 ; AVX2-NEXT: vmovaps 1824(%rdi), %xmm1
16119 ; AVX2-NEXT: vinsertf128 $1, 1952(%rdi), %ymm1, %ymm1
16120 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
16121 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16122 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
16123 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16124 ; AVX2-NEXT: vmovaps 2144(%rdi), %xmm0
16125 ; AVX2-NEXT: vinsertf128 $1, 2272(%rdi), %ymm0, %ymm0
16126 ; AVX2-NEXT: vmovaps 2080(%rdi), %xmm1
16127 ; AVX2-NEXT: vinsertf128 $1, 2208(%rdi), %ymm1, %ymm1
16128 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
16129 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16130 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
16131 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16132 ; AVX2-NEXT: vmovaps 2400(%rdi), %xmm0
16133 ; AVX2-NEXT: vinsertf128 $1, 2528(%rdi), %ymm0, %ymm0
16134 ; AVX2-NEXT: vmovaps 2336(%rdi), %xmm1
16135 ; AVX2-NEXT: vinsertf128 $1, 2464(%rdi), %ymm1, %ymm1
16136 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
16137 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16138 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
16139 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16140 ; AVX2-NEXT: vmovaps 2656(%rdi), %xmm0
16141 ; AVX2-NEXT: vinsertf128 $1, 2784(%rdi), %ymm0, %ymm0
16142 ; AVX2-NEXT: vmovaps 2592(%rdi), %xmm1
16143 ; AVX2-NEXT: vinsertf128 $1, 2720(%rdi), %ymm1, %ymm1
16144 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
16145 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16146 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
16147 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16148 ; AVX2-NEXT: vmovaps 2912(%rdi), %xmm0
16149 ; AVX2-NEXT: vinsertf128 $1, 3040(%rdi), %ymm0, %ymm0
16150 ; AVX2-NEXT: vmovaps 2848(%rdi), %xmm1
16151 ; AVX2-NEXT: vinsertf128 $1, 2976(%rdi), %ymm1, %ymm1
16152 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
16153 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16154 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
16155 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16156 ; AVX2-NEXT: vmovaps 3168(%rdi), %xmm0
16157 ; AVX2-NEXT: vinsertf128 $1, 3296(%rdi), %ymm0, %ymm0
16158 ; AVX2-NEXT: vmovaps 3104(%rdi), %xmm1
16159 ; AVX2-NEXT: vinsertf128 $1, 3232(%rdi), %ymm1, %ymm1
16160 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
16161 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16162 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
16163 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16164 ; AVX2-NEXT: vmovaps 3424(%rdi), %xmm0
16165 ; AVX2-NEXT: vinsertf128 $1, 3552(%rdi), %ymm0, %ymm0
16166 ; AVX2-NEXT: vmovaps 3360(%rdi), %xmm1
16167 ; AVX2-NEXT: vinsertf128 $1, 3488(%rdi), %ymm1, %ymm1
16168 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
16169 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16170 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
16171 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16172 ; AVX2-NEXT: vmovaps 3680(%rdi), %xmm0
16173 ; AVX2-NEXT: vinsertf128 $1, 3808(%rdi), %ymm0, %ymm0
16174 ; AVX2-NEXT: vmovaps 3616(%rdi), %xmm1
16175 ; AVX2-NEXT: vinsertf128 $1, 3744(%rdi), %ymm1, %ymm1
16176 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
16177 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16178 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
16179 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16180 ; AVX2-NEXT: vmovaps 3936(%rdi), %xmm0
16181 ; AVX2-NEXT: vinsertf128 $1, 4064(%rdi), %ymm0, %ymm0
16182 ; AVX2-NEXT: vmovaps 3872(%rdi), %xmm1
16183 ; AVX2-NEXT: vinsertf128 $1, 4000(%rdi), %ymm1, %ymm1
16184 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
16185 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16186 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
16187 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16188 ; AVX2-NEXT: vmovaps 96(%rdi), %ymm2
16189 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16190 ; AVX2-NEXT: vmovaps 32(%rdi), %ymm3
16191 ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16192 ; AVX2-NEXT: vmovaps 224(%rdi), %ymm1
16193 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16194 ; AVX2-NEXT: vmovaps 160(%rdi), %ymm0
16195 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16196 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
16197 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
16198 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
16199 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16200 ; AVX2-NEXT: vmovaps 352(%rdi), %ymm2
16201 ; AVX2-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill
16202 ; AVX2-NEXT: vmovaps 288(%rdi), %ymm3
16203 ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16204 ; AVX2-NEXT: vmovaps 480(%rdi), %ymm1
16205 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16206 ; AVX2-NEXT: vmovaps 416(%rdi), %ymm0
16207 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16208 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
16209 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
16210 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
16211 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16212 ; AVX2-NEXT: vmovaps 608(%rdi), %ymm2
16213 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16214 ; AVX2-NEXT: vmovaps 544(%rdi), %ymm3
16215 ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16216 ; AVX2-NEXT: vmovaps 736(%rdi), %ymm1
16217 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16218 ; AVX2-NEXT: vmovaps 672(%rdi), %ymm0
16219 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16220 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
16221 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
16222 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
16223 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16224 ; AVX2-NEXT: vmovaps 864(%rdi), %ymm2
16225 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16226 ; AVX2-NEXT: vmovaps 800(%rdi), %ymm3
16227 ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16228 ; AVX2-NEXT: vmovaps 992(%rdi), %ymm1
16229 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16230 ; AVX2-NEXT: vmovaps 928(%rdi), %ymm0
16231 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16232 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
16233 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
16234 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
16235 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16236 ; AVX2-NEXT: vmovaps 1120(%rdi), %ymm2
16237 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16238 ; AVX2-NEXT: vmovaps 1056(%rdi), %ymm3
16239 ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16240 ; AVX2-NEXT: vmovaps 1248(%rdi), %ymm1
16241 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16242 ; AVX2-NEXT: vmovaps 1184(%rdi), %ymm0
16243 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16244 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
16245 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
16246 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
16247 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16248 ; AVX2-NEXT: vmovaps 1376(%rdi), %ymm2
16249 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16250 ; AVX2-NEXT: vmovaps 1312(%rdi), %ymm3
16251 ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16252 ; AVX2-NEXT: vmovaps 1504(%rdi), %ymm1
16253 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16254 ; AVX2-NEXT: vmovaps 1440(%rdi), %ymm0
16255 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16256 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
16257 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
16258 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
16259 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16260 ; AVX2-NEXT: vmovaps 1632(%rdi), %ymm2
16261 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16262 ; AVX2-NEXT: vmovaps 1568(%rdi), %ymm3
16263 ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16264 ; AVX2-NEXT: vmovaps 1760(%rdi), %ymm1
16265 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16266 ; AVX2-NEXT: vmovaps 1696(%rdi), %ymm0
16267 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16268 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
16269 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
16270 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
16271 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16272 ; AVX2-NEXT: vmovaps 1888(%rdi), %ymm2
16273 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16274 ; AVX2-NEXT: vmovaps 1824(%rdi), %ymm3
16275 ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16276 ; AVX2-NEXT: vmovaps 2016(%rdi), %ymm1
16277 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16278 ; AVX2-NEXT: vmovaps 1952(%rdi), %ymm0
16279 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16280 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
16281 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
16282 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
16283 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16284 ; AVX2-NEXT: vmovaps 2144(%rdi), %ymm2
16285 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16286 ; AVX2-NEXT: vmovaps 2080(%rdi), %ymm3
16287 ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16288 ; AVX2-NEXT: vmovaps 2272(%rdi), %ymm1
16289 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16290 ; AVX2-NEXT: vmovaps 2208(%rdi), %ymm0
16291 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16292 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
16293 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
16294 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
16295 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16296 ; AVX2-NEXT: vmovaps 2400(%rdi), %ymm2
16297 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16298 ; AVX2-NEXT: vmovaps 2336(%rdi), %ymm3
16299 ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16300 ; AVX2-NEXT: vmovaps 2528(%rdi), %ymm1
16301 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16302 ; AVX2-NEXT: vmovaps 2464(%rdi), %ymm0
16303 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16304 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
16305 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
16306 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
16307 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16308 ; AVX2-NEXT: vmovaps 2656(%rdi), %ymm2
16309 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16310 ; AVX2-NEXT: vmovaps 2592(%rdi), %ymm3
16311 ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16312 ; AVX2-NEXT: vmovaps 2784(%rdi), %ymm1
16313 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16314 ; AVX2-NEXT: vmovaps 2720(%rdi), %ymm0
16315 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16316 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
16317 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
16318 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
16319 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16320 ; AVX2-NEXT: vmovaps 2912(%rdi), %ymm2
16321 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16322 ; AVX2-NEXT: vmovaps 2848(%rdi), %ymm3
16323 ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16324 ; AVX2-NEXT: vmovaps 3040(%rdi), %ymm1
16325 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16326 ; AVX2-NEXT: vmovaps 2976(%rdi), %ymm0
16327 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16328 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
16329 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
16330 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
16331 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16332 ; AVX2-NEXT: vmovaps 3168(%rdi), %ymm2
16333 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16334 ; AVX2-NEXT: vmovaps 3104(%rdi), %ymm1
16335 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16336 ; AVX2-NEXT: vmovaps 3296(%rdi), %ymm14
16337 ; AVX2-NEXT: vmovaps 3232(%rdi), %ymm13
16338 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm14[0],ymm13[2],ymm14[2]
16339 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
16340 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
16341 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16342 ; AVX2-NEXT: vmovaps 3424(%rdi), %ymm12
16343 ; AVX2-NEXT: vmovaps 3360(%rdi), %ymm11
16344 ; AVX2-NEXT: vmovaps 3552(%rdi), %ymm10
16345 ; AVX2-NEXT: vmovaps 3488(%rdi), %ymm9
16346 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm9[0],ymm10[0],ymm9[2],ymm10[2]
16347 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm11[0],ymm12[0],ymm11[2],ymm12[2]
16348 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
16349 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16350 ; AVX2-NEXT: vmovaps 3680(%rdi), %ymm8
16351 ; AVX2-NEXT: vmovaps 3616(%rdi), %ymm7
16352 ; AVX2-NEXT: vmovaps 3808(%rdi), %ymm6
16353 ; AVX2-NEXT: vmovaps 3744(%rdi), %ymm5
16354 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm6[0],ymm5[2],ymm6[2]
16355 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm7[0],ymm8[0],ymm7[2],ymm8[2]
16356 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
16357 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16358 ; AVX2-NEXT: vmovaps 3936(%rdi), %ymm4
16359 ; AVX2-NEXT: vmovaps 3872(%rdi), %ymm3
16360 ; AVX2-NEXT: vmovaps 4064(%rdi), %ymm2
16361 ; AVX2-NEXT: vmovaps 4000(%rdi), %ymm1
16362 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
16363 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm3[0],ymm4[0],ymm3[2],ymm4[2]
16364 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3]
16365 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16366 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
16367 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
16368 ; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
16369 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
16370 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
16371 ; AVX2-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3]
16372 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3]
16373 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16374 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
16375 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
16376 ; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
16377 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
16378 ; AVX2-NEXT: vunpckhpd (%rsp), %ymm15, %ymm15 # 32-byte Folded Reload
16379 ; AVX2-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3]
16380 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3]
16381 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16382 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
16383 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
16384 ; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
16385 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
16386 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
16387 ; AVX2-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3]
16388 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3]
16389 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16390 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
16391 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
16392 ; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
16393 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
16394 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
16395 ; AVX2-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3]
16396 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3]
16397 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16398 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
16399 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
16400 ; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
16401 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
16402 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
16403 ; AVX2-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3]
16404 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3]
16405 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16406 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
16407 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
16408 ; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
16409 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
16410 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
16411 ; AVX2-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3]
16412 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3]
16413 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16414 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
16415 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
16416 ; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
16417 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
16418 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
16419 ; AVX2-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3]
16420 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3]
16421 ; AVX2-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
16422 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
16423 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
16424 ; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
16425 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
16426 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
16427 ; AVX2-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3]
16428 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3]
16429 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16430 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
16431 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
16432 ; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
16433 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
16434 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
16435 ; AVX2-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3]
16436 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3]
16437 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16438 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
16439 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
16440 ; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
16441 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
16442 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
16443 ; AVX2-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3]
16444 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3]
16445 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16446 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
16447 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
16448 ; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
16449 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
16450 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
16451 ; AVX2-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3]
16452 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3]
16453 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16454 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
16455 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
16456 ; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
16457 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
16458 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
16459 ; AVX2-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3]
16460 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm15[2,3],ymm0[2,3]
16461 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm13[1],ymm14[1],ymm13[3],ymm14[3]
16462 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
16463 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
16464 ; AVX2-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
16465 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm13[2,3],ymm0[2,3]
16466 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm9[1],ymm10[1],ymm9[3],ymm10[3]
16467 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm11[1],ymm12[1],ymm11[3],ymm12[3]
16468 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3],ymm0[2,3]
16469 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm6[1],ymm5[3],ymm6[3]
16470 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm7[1],ymm8[1],ymm7[3],ymm8[3]
16471 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3],ymm0[2,3]
16472 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm2[1],ymm1[3],ymm2[3]
16473 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm4[1],ymm3[3],ymm4[3]
16474 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
16475 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16476 ; AVX2-NEXT: vmovaps %ymm1, 448(%rsi)
16477 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16478 ; AVX2-NEXT: vmovaps %ymm1, 384(%rsi)
16479 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16480 ; AVX2-NEXT: vmovaps %ymm1, 320(%rsi)
16481 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16482 ; AVX2-NEXT: vmovaps %ymm1, 256(%rsi)
16483 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16484 ; AVX2-NEXT: vmovaps %ymm1, 192(%rsi)
16485 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16486 ; AVX2-NEXT: vmovaps %ymm1, 128(%rsi)
16487 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16488 ; AVX2-NEXT: vmovaps %ymm1, 64(%rsi)
16489 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16490 ; AVX2-NEXT: vmovaps %ymm1, (%rsi)
16491 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16492 ; AVX2-NEXT: vmovaps %ymm1, 480(%rsi)
16493 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16494 ; AVX2-NEXT: vmovaps %ymm1, 416(%rsi)
16495 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16496 ; AVX2-NEXT: vmovaps %ymm1, 352(%rsi)
16497 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16498 ; AVX2-NEXT: vmovaps %ymm1, 288(%rsi)
16499 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16500 ; AVX2-NEXT: vmovaps %ymm1, 224(%rsi)
16501 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16502 ; AVX2-NEXT: vmovaps %ymm1, 160(%rsi)
16503 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16504 ; AVX2-NEXT: vmovaps %ymm1, 96(%rsi)
16505 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16506 ; AVX2-NEXT: vmovaps %ymm1, 32(%rsi)
16507 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16508 ; AVX2-NEXT: vmovaps %ymm1, 448(%rdx)
16509 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16510 ; AVX2-NEXT: vmovaps %ymm1, 384(%rdx)
16511 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16512 ; AVX2-NEXT: vmovaps %ymm1, 320(%rdx)
16513 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16514 ; AVX2-NEXT: vmovaps %ymm1, 256(%rdx)
16515 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16516 ; AVX2-NEXT: vmovaps %ymm1, 192(%rdx)
16517 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16518 ; AVX2-NEXT: vmovaps %ymm1, 128(%rdx)
16519 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16520 ; AVX2-NEXT: vmovaps %ymm1, 64(%rdx)
16521 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16522 ; AVX2-NEXT: vmovaps %ymm1, (%rdx)
16523 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16524 ; AVX2-NEXT: vmovaps %ymm1, 480(%rdx)
16525 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16526 ; AVX2-NEXT: vmovaps %ymm1, 416(%rdx)
16527 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16528 ; AVX2-NEXT: vmovaps %ymm1, 352(%rdx)
16529 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16530 ; AVX2-NEXT: vmovaps %ymm1, 288(%rdx)
16531 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16532 ; AVX2-NEXT: vmovaps %ymm1, 224(%rdx)
16533 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16534 ; AVX2-NEXT: vmovaps %ymm1, 160(%rdx)
16535 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16536 ; AVX2-NEXT: vmovaps %ymm1, 96(%rdx)
16537 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16538 ; AVX2-NEXT: vmovaps %ymm1, 32(%rdx)
16539 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16540 ; AVX2-NEXT: vmovaps %ymm1, 448(%rcx)
16541 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16542 ; AVX2-NEXT: vmovaps %ymm1, 384(%rcx)
16543 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16544 ; AVX2-NEXT: vmovaps %ymm1, 320(%rcx)
16545 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16546 ; AVX2-NEXT: vmovaps %ymm1, 256(%rcx)
16547 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16548 ; AVX2-NEXT: vmovaps %ymm1, 192(%rcx)
16549 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16550 ; AVX2-NEXT: vmovaps %ymm1, 128(%rcx)
16551 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16552 ; AVX2-NEXT: vmovaps %ymm1, 64(%rcx)
16553 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16554 ; AVX2-NEXT: vmovaps %ymm1, (%rcx)
16555 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16556 ; AVX2-NEXT: vmovaps %ymm1, 480(%rcx)
16557 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16558 ; AVX2-NEXT: vmovaps %ymm1, 416(%rcx)
16559 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16560 ; AVX2-NEXT: vmovaps %ymm1, 352(%rcx)
16561 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16562 ; AVX2-NEXT: vmovaps %ymm1, 288(%rcx)
16563 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16564 ; AVX2-NEXT: vmovaps %ymm1, 224(%rcx)
16565 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16566 ; AVX2-NEXT: vmovaps %ymm1, 160(%rcx)
16567 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16568 ; AVX2-NEXT: vmovaps %ymm1, 96(%rcx)
16569 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16570 ; AVX2-NEXT: vmovaps %ymm1, 32(%rcx)
16571 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16572 ; AVX2-NEXT: vmovaps %ymm1, 480(%r8)
16573 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16574 ; AVX2-NEXT: vmovaps %ymm1, 448(%r8)
16575 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16576 ; AVX2-NEXT: vmovaps %ymm1, 416(%r8)
16577 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16578 ; AVX2-NEXT: vmovaps %ymm1, 384(%r8)
16579 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16580 ; AVX2-NEXT: vmovaps %ymm1, 352(%r8)
16581 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16582 ; AVX2-NEXT: vmovaps %ymm1, 320(%r8)
16583 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16584 ; AVX2-NEXT: vmovaps %ymm1, 288(%r8)
16585 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16586 ; AVX2-NEXT: vmovaps %ymm1, 256(%r8)
16587 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16588 ; AVX2-NEXT: vmovaps %ymm1, 224(%r8)
16589 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16590 ; AVX2-NEXT: vmovaps %ymm1, 192(%r8)
16591 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16592 ; AVX2-NEXT: vmovaps %ymm1, 160(%r8)
16593 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16594 ; AVX2-NEXT: vmovaps %ymm1, 128(%r8)
16595 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16596 ; AVX2-NEXT: vmovaps %ymm1, 96(%r8)
16597 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16598 ; AVX2-NEXT: vmovaps %ymm1, 64(%r8)
16599 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16600 ; AVX2-NEXT: vmovaps %ymm1, 32(%r8)
16601 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16602 ; AVX2-NEXT: vmovaps %ymm1, (%r8)
16603 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16604 ; AVX2-NEXT: vmovaps %ymm1, 480(%r9)
16605 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16606 ; AVX2-NEXT: vmovaps %ymm1, 448(%r9)
16607 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16608 ; AVX2-NEXT: vmovaps %ymm1, 416(%r9)
16609 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16610 ; AVX2-NEXT: vmovaps %ymm1, 384(%r9)
16611 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16612 ; AVX2-NEXT: vmovaps %ymm1, 352(%r9)
16613 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16614 ; AVX2-NEXT: vmovaps %ymm1, 320(%r9)
16615 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16616 ; AVX2-NEXT: vmovaps %ymm1, 288(%r9)
16617 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16618 ; AVX2-NEXT: vmovaps %ymm1, 256(%r9)
16619 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16620 ; AVX2-NEXT: vmovaps %ymm1, 224(%r9)
16621 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16622 ; AVX2-NEXT: vmovaps %ymm1, 192(%r9)
16623 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16624 ; AVX2-NEXT: vmovaps %ymm1, 160(%r9)
16625 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16626 ; AVX2-NEXT: vmovaps %ymm1, 128(%r9)
16627 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16628 ; AVX2-NEXT: vmovaps %ymm1, 96(%r9)
16629 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16630 ; AVX2-NEXT: vmovaps %ymm1, 64(%r9)
16631 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16632 ; AVX2-NEXT: vmovaps %ymm1, 32(%r9)
16633 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16634 ; AVX2-NEXT: vmovaps %ymm1, (%r9)
16635 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
16636 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16637 ; AVX2-NEXT: vmovaps %ymm1, 480(%rax)
16638 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16639 ; AVX2-NEXT: vmovaps %ymm1, 448(%rax)
16640 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16641 ; AVX2-NEXT: vmovaps %ymm1, 416(%rax)
16642 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16643 ; AVX2-NEXT: vmovaps %ymm1, 384(%rax)
16644 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16645 ; AVX2-NEXT: vmovaps %ymm1, 352(%rax)
16646 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16647 ; AVX2-NEXT: vmovaps %ymm1, 320(%rax)
16648 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16649 ; AVX2-NEXT: vmovaps %ymm1, 288(%rax)
16650 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16651 ; AVX2-NEXT: vmovaps %ymm1, 256(%rax)
16652 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16653 ; AVX2-NEXT: vmovaps %ymm1, 224(%rax)
16654 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16655 ; AVX2-NEXT: vmovaps %ymm1, 192(%rax)
16656 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16657 ; AVX2-NEXT: vmovaps %ymm1, 160(%rax)
16658 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16659 ; AVX2-NEXT: vmovaps %ymm1, 128(%rax)
16660 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16661 ; AVX2-NEXT: vmovaps %ymm1, 96(%rax)
16662 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16663 ; AVX2-NEXT: vmovaps %ymm1, 64(%rax)
16664 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16665 ; AVX2-NEXT: vmovaps %ymm1, 32(%rax)
16666 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16667 ; AVX2-NEXT: vmovaps %ymm1, (%rax)
16668 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
16669 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16670 ; AVX2-NEXT: vmovaps %ymm1, 480(%rax)
16671 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16672 ; AVX2-NEXT: vmovaps %ymm1, 448(%rax)
16673 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16674 ; AVX2-NEXT: vmovaps %ymm1, 416(%rax)
16675 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16676 ; AVX2-NEXT: vmovaps %ymm1, 384(%rax)
16677 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16678 ; AVX2-NEXT: vmovaps %ymm1, 352(%rax)
16679 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16680 ; AVX2-NEXT: vmovaps %ymm1, 320(%rax)
16681 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16682 ; AVX2-NEXT: vmovaps %ymm1, 288(%rax)
16683 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16684 ; AVX2-NEXT: vmovaps %ymm1, 256(%rax)
16685 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16686 ; AVX2-NEXT: vmovaps %ymm1, 224(%rax)
16687 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16688 ; AVX2-NEXT: vmovaps %ymm1, 192(%rax)
16689 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16690 ; AVX2-NEXT: vmovaps %ymm1, 160(%rax)
16691 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16692 ; AVX2-NEXT: vmovaps %ymm1, 128(%rax)
16693 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16694 ; AVX2-NEXT: vmovaps %ymm1, 96(%rax)
16695 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16696 ; AVX2-NEXT: vmovaps %ymm1, 64(%rax)
16697 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16698 ; AVX2-NEXT: vmovaps %ymm1, 32(%rax)
16699 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16700 ; AVX2-NEXT: vmovaps %ymm1, (%rax)
16701 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
16702 ; AVX2-NEXT: vmovaps %ymm0, 480(%rax)
16703 ; AVX2-NEXT: vmovaps %ymm5, 448(%rax)
16704 ; AVX2-NEXT: vmovaps %ymm9, 416(%rax)
16705 ; AVX2-NEXT: vmovaps %ymm13, 384(%rax)
16706 ; AVX2-NEXT: vmovaps %ymm15, 352(%rax)
16707 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
16708 ; AVX2-NEXT: vmovaps %ymm0, 320(%rax)
16709 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
16710 ; AVX2-NEXT: vmovaps %ymm0, 288(%rax)
16711 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
16712 ; AVX2-NEXT: vmovaps %ymm0, 256(%rax)
16713 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
16714 ; AVX2-NEXT: vmovaps %ymm0, 224(%rax)
16715 ; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
16716 ; AVX2-NEXT: vmovaps %ymm0, 192(%rax)
16717 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
16718 ; AVX2-NEXT: vmovaps %ymm0, 160(%rax)
16719 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
16720 ; AVX2-NEXT: vmovaps %ymm0, 128(%rax)
16721 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
16722 ; AVX2-NEXT: vmovaps %ymm0, 96(%rax)
16723 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
16724 ; AVX2-NEXT: vmovaps %ymm0, 64(%rax)
16725 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
16726 ; AVX2-NEXT: vmovaps %ymm0, 32(%rax)
16727 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
16728 ; AVX2-NEXT: vmovaps %ymm0, (%rax)
16729 ; AVX2-NEXT: addq $5064, %rsp # imm = 0x13C8
16730 ; AVX2-NEXT: vzeroupper
16733 ; AVX2-FP-LABEL: load_i64_stride8_vf64:
16734 ; AVX2-FP: # %bb.0:
16735 ; AVX2-FP-NEXT: subq $5064, %rsp # imm = 0x13C8
16736 ; AVX2-FP-NEXT: vmovaps 320(%rdi), %xmm0
16737 ; AVX2-FP-NEXT: vinsertf128 $1, 448(%rdi), %ymm0, %ymm0
16738 ; AVX2-FP-NEXT: vmovaps 256(%rdi), %xmm1
16739 ; AVX2-FP-NEXT: vinsertf128 $1, 384(%rdi), %ymm1, %ymm1
16740 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
16741 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16742 ; AVX2-FP-NEXT: vmovaps 832(%rdi), %xmm2
16743 ; AVX2-FP-NEXT: vinsertf128 $1, 960(%rdi), %ymm2, %ymm2
16744 ; AVX2-FP-NEXT: vmovaps 768(%rdi), %xmm3
16745 ; AVX2-FP-NEXT: vinsertf128 $1, 896(%rdi), %ymm3, %ymm3
16746 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
16747 ; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16748 ; AVX2-FP-NEXT: vmovaps 1344(%rdi), %xmm4
16749 ; AVX2-FP-NEXT: vinsertf128 $1, 1472(%rdi), %ymm4, %ymm4
16750 ; AVX2-FP-NEXT: vmovaps 1280(%rdi), %xmm5
16751 ; AVX2-FP-NEXT: vinsertf128 $1, 1408(%rdi), %ymm5, %ymm5
16752 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm5[0],ymm4[0],ymm5[2],ymm4[2]
16753 ; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16754 ; AVX2-FP-NEXT: vmovaps 1856(%rdi), %xmm6
16755 ; AVX2-FP-NEXT: vinsertf128 $1, 1984(%rdi), %ymm6, %ymm6
16756 ; AVX2-FP-NEXT: vmovaps 1792(%rdi), %xmm7
16757 ; AVX2-FP-NEXT: vinsertf128 $1, 1920(%rdi), %ymm7, %ymm7
16758 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm7[0],ymm6[0],ymm7[2],ymm6[2]
16759 ; AVX2-FP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16760 ; AVX2-FP-NEXT: vmovaps 2368(%rdi), %xmm8
16761 ; AVX2-FP-NEXT: vinsertf128 $1, 2496(%rdi), %ymm8, %ymm8
16762 ; AVX2-FP-NEXT: vmovaps 2304(%rdi), %xmm9
16763 ; AVX2-FP-NEXT: vinsertf128 $1, 2432(%rdi), %ymm9, %ymm9
16764 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2]
16765 ; AVX2-FP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16766 ; AVX2-FP-NEXT: vmovaps 2880(%rdi), %xmm10
16767 ; AVX2-FP-NEXT: vinsertf128 $1, 3008(%rdi), %ymm10, %ymm10
16768 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
16769 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16770 ; AVX2-FP-NEXT: vmovaps 2816(%rdi), %xmm0
16771 ; AVX2-FP-NEXT: vinsertf128 $1, 2944(%rdi), %ymm0, %ymm0
16772 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm2[1],ymm3[3],ymm2[3]
16773 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16774 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm10[0],ymm0[2],ymm10[2]
16775 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16776 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm5[1],ymm4[1],ymm5[3],ymm4[3]
16777 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16778 ; AVX2-FP-NEXT: vmovaps 3392(%rdi), %xmm1
16779 ; AVX2-FP-NEXT: vinsertf128 $1, 3520(%rdi), %ymm1, %ymm1
16780 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm7[1],ymm6[1],ymm7[3],ymm6[3]
16781 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16782 ; AVX2-FP-NEXT: vmovaps 3328(%rdi), %xmm2
16783 ; AVX2-FP-NEXT: vinsertf128 $1, 3456(%rdi), %ymm2, %ymm2
16784 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm9[1],ymm8[1],ymm9[3],ymm8[3]
16785 ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16786 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
16787 ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16788 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm10[1],ymm0[3],ymm10[3]
16789 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16790 ; AVX2-FP-NEXT: vmovaps 3904(%rdi), %xmm0
16791 ; AVX2-FP-NEXT: vinsertf128 $1, 4032(%rdi), %ymm0, %ymm0
16792 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3]
16793 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16794 ; AVX2-FP-NEXT: vmovaps 3840(%rdi), %xmm1
16795 ; AVX2-FP-NEXT: vinsertf128 $1, 3968(%rdi), %ymm1, %ymm1
16796 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
16797 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16798 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
16799 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16800 ; AVX2-FP-NEXT: vmovaps 64(%rdi), %xmm0
16801 ; AVX2-FP-NEXT: vinsertf128 $1, 192(%rdi), %ymm0, %ymm0
16802 ; AVX2-FP-NEXT: vmovaps (%rdi), %xmm1
16803 ; AVX2-FP-NEXT: vinsertf128 $1, 128(%rdi), %ymm1, %ymm1
16804 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
16805 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16806 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
16807 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16808 ; AVX2-FP-NEXT: vmovaps 576(%rdi), %xmm0
16809 ; AVX2-FP-NEXT: vinsertf128 $1, 704(%rdi), %ymm0, %ymm0
16810 ; AVX2-FP-NEXT: vmovaps 512(%rdi), %xmm1
16811 ; AVX2-FP-NEXT: vinsertf128 $1, 640(%rdi), %ymm1, %ymm1
16812 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
16813 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16814 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
16815 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16816 ; AVX2-FP-NEXT: vmovaps 1088(%rdi), %xmm0
16817 ; AVX2-FP-NEXT: vinsertf128 $1, 1216(%rdi), %ymm0, %ymm0
16818 ; AVX2-FP-NEXT: vmovaps 1024(%rdi), %xmm1
16819 ; AVX2-FP-NEXT: vinsertf128 $1, 1152(%rdi), %ymm1, %ymm1
16820 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
16821 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16822 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
16823 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16824 ; AVX2-FP-NEXT: vmovaps 1600(%rdi), %xmm0
16825 ; AVX2-FP-NEXT: vinsertf128 $1, 1728(%rdi), %ymm0, %ymm0
16826 ; AVX2-FP-NEXT: vmovaps 1536(%rdi), %xmm1
16827 ; AVX2-FP-NEXT: vinsertf128 $1, 1664(%rdi), %ymm1, %ymm1
16828 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
16829 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16830 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
16831 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16832 ; AVX2-FP-NEXT: vmovaps 2112(%rdi), %xmm0
16833 ; AVX2-FP-NEXT: vinsertf128 $1, 2240(%rdi), %ymm0, %ymm0
16834 ; AVX2-FP-NEXT: vmovaps 2048(%rdi), %xmm1
16835 ; AVX2-FP-NEXT: vinsertf128 $1, 2176(%rdi), %ymm1, %ymm1
16836 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
16837 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16838 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
16839 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16840 ; AVX2-FP-NEXT: vmovaps 2624(%rdi), %xmm0
16841 ; AVX2-FP-NEXT: vinsertf128 $1, 2752(%rdi), %ymm0, %ymm0
16842 ; AVX2-FP-NEXT: vmovaps 2560(%rdi), %xmm1
16843 ; AVX2-FP-NEXT: vinsertf128 $1, 2688(%rdi), %ymm1, %ymm1
16844 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
16845 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16846 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
16847 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16848 ; AVX2-FP-NEXT: vmovaps 3136(%rdi), %xmm0
16849 ; AVX2-FP-NEXT: vinsertf128 $1, 3264(%rdi), %ymm0, %ymm0
16850 ; AVX2-FP-NEXT: vmovaps 3072(%rdi), %xmm1
16851 ; AVX2-FP-NEXT: vinsertf128 $1, 3200(%rdi), %ymm1, %ymm1
16852 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
16853 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16854 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
16855 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16856 ; AVX2-FP-NEXT: vmovaps 3648(%rdi), %xmm0
16857 ; AVX2-FP-NEXT: vinsertf128 $1, 3776(%rdi), %ymm0, %ymm0
16858 ; AVX2-FP-NEXT: vmovaps 3584(%rdi), %xmm1
16859 ; AVX2-FP-NEXT: vinsertf128 $1, 3712(%rdi), %ymm1, %ymm1
16860 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
16861 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16862 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
16863 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16864 ; AVX2-FP-NEXT: vmovaps 320(%rdi), %ymm2
16865 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16866 ; AVX2-FP-NEXT: vmovaps 256(%rdi), %ymm3
16867 ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16868 ; AVX2-FP-NEXT: vmovaps 448(%rdi), %ymm1
16869 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16870 ; AVX2-FP-NEXT: vmovaps 384(%rdi), %ymm0
16871 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16872 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
16873 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
16874 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
16875 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16876 ; AVX2-FP-NEXT: vmovaps 832(%rdi), %ymm2
16877 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16878 ; AVX2-FP-NEXT: vmovaps 768(%rdi), %ymm3
16879 ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16880 ; AVX2-FP-NEXT: vmovaps 960(%rdi), %ymm1
16881 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16882 ; AVX2-FP-NEXT: vmovaps 896(%rdi), %ymm0
16883 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16884 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
16885 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
16886 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
16887 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16888 ; AVX2-FP-NEXT: vmovaps 1344(%rdi), %ymm2
16889 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16890 ; AVX2-FP-NEXT: vmovaps 1280(%rdi), %ymm3
16891 ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16892 ; AVX2-FP-NEXT: vmovaps 1472(%rdi), %ymm1
16893 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16894 ; AVX2-FP-NEXT: vmovaps 1408(%rdi), %ymm0
16895 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16896 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
16897 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
16898 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
16899 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16900 ; AVX2-FP-NEXT: vmovaps 1856(%rdi), %ymm2
16901 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16902 ; AVX2-FP-NEXT: vmovaps 1792(%rdi), %ymm3
16903 ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16904 ; AVX2-FP-NEXT: vmovaps 1984(%rdi), %ymm1
16905 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16906 ; AVX2-FP-NEXT: vmovaps 1920(%rdi), %ymm0
16907 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16908 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
16909 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
16910 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
16911 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16912 ; AVX2-FP-NEXT: vmovaps 2368(%rdi), %ymm2
16913 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16914 ; AVX2-FP-NEXT: vmovaps 2304(%rdi), %ymm3
16915 ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16916 ; AVX2-FP-NEXT: vmovaps 2496(%rdi), %ymm1
16917 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16918 ; AVX2-FP-NEXT: vmovaps 2432(%rdi), %ymm0
16919 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16920 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
16921 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
16922 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
16923 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16924 ; AVX2-FP-NEXT: vmovaps 2880(%rdi), %ymm2
16925 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16926 ; AVX2-FP-NEXT: vmovaps 2816(%rdi), %ymm3
16927 ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16928 ; AVX2-FP-NEXT: vmovaps 3008(%rdi), %ymm1
16929 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16930 ; AVX2-FP-NEXT: vmovaps 2944(%rdi), %ymm0
16931 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16932 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
16933 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
16934 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
16935 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16936 ; AVX2-FP-NEXT: vmovaps 3392(%rdi), %ymm2
16937 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16938 ; AVX2-FP-NEXT: vmovaps 3328(%rdi), %ymm3
16939 ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16940 ; AVX2-FP-NEXT: vmovaps 3520(%rdi), %ymm1
16941 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16942 ; AVX2-FP-NEXT: vmovaps 3456(%rdi), %ymm0
16943 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16944 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
16945 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
16946 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
16947 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16948 ; AVX2-FP-NEXT: vmovaps 3904(%rdi), %ymm2
16949 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16950 ; AVX2-FP-NEXT: vmovaps 3840(%rdi), %ymm3
16951 ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16952 ; AVX2-FP-NEXT: vmovaps 4032(%rdi), %ymm1
16953 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16954 ; AVX2-FP-NEXT: vmovaps 3968(%rdi), %ymm0
16955 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16956 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
16957 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
16958 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
16959 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16960 ; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm2
16961 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16962 ; AVX2-FP-NEXT: vmovaps (%rdi), %ymm3
16963 ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16964 ; AVX2-FP-NEXT: vmovaps 192(%rdi), %ymm1
16965 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16966 ; AVX2-FP-NEXT: vmovaps 128(%rdi), %ymm0
16967 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16968 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
16969 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
16970 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
16971 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16972 ; AVX2-FP-NEXT: vmovaps 576(%rdi), %ymm2
16973 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16974 ; AVX2-FP-NEXT: vmovaps 512(%rdi), %ymm3
16975 ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16976 ; AVX2-FP-NEXT: vmovaps 704(%rdi), %ymm1
16977 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16978 ; AVX2-FP-NEXT: vmovaps 640(%rdi), %ymm0
16979 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16980 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
16981 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
16982 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
16983 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16984 ; AVX2-FP-NEXT: vmovaps 1088(%rdi), %ymm2
16985 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16986 ; AVX2-FP-NEXT: vmovaps 1024(%rdi), %ymm3
16987 ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16988 ; AVX2-FP-NEXT: vmovaps 1216(%rdi), %ymm0
16989 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16990 ; AVX2-FP-NEXT: vmovaps 1152(%rdi), %ymm15
16991 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm0[0],ymm15[2],ymm0[2]
16992 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
16993 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
16994 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16995 ; AVX2-FP-NEXT: vmovaps 1600(%rdi), %ymm2
16996 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16997 ; AVX2-FP-NEXT: vmovaps 1536(%rdi), %ymm1
16998 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16999 ; AVX2-FP-NEXT: vmovaps 1728(%rdi), %ymm14
17000 ; AVX2-FP-NEXT: vmovaps 1664(%rdi), %ymm11
17001 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm14[0],ymm11[2],ymm14[2]
17002 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
17003 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
17004 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17005 ; AVX2-FP-NEXT: vmovaps 2112(%rdi), %ymm2
17006 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17007 ; AVX2-FP-NEXT: vmovaps 2048(%rdi), %ymm1
17008 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17009 ; AVX2-FP-NEXT: vmovaps 2240(%rdi), %ymm10
17010 ; AVX2-FP-NEXT: vmovaps 2176(%rdi), %ymm8
17011 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm8[0],ymm10[0],ymm8[2],ymm10[2]
17012 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
17013 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
17014 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17015 ; AVX2-FP-NEXT: vmovaps 2624(%rdi), %ymm2
17016 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17017 ; AVX2-FP-NEXT: vmovaps 2560(%rdi), %ymm1
17018 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17019 ; AVX2-FP-NEXT: vmovaps 2752(%rdi), %ymm7
17020 ; AVX2-FP-NEXT: vmovaps 2688(%rdi), %ymm5
17021 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm7[0],ymm5[2],ymm7[2]
17022 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
17023 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
17024 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17025 ; AVX2-FP-NEXT: vmovaps 3136(%rdi), %ymm1
17026 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17027 ; AVX2-FP-NEXT: vmovaps 3072(%rdi), %ymm9
17028 ; AVX2-FP-NEXT: vmovaps 3264(%rdi), %ymm4
17029 ; AVX2-FP-NEXT: vmovaps 3200(%rdi), %ymm3
17030 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm4[0],ymm3[2],ymm4[2]
17031 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm9[0],ymm1[0],ymm9[2],ymm1[2]
17032 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3]
17033 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17034 ; AVX2-FP-NEXT: vmovaps 3648(%rdi), %ymm12
17035 ; AVX2-FP-NEXT: vmovaps 3584(%rdi), %ymm6
17036 ; AVX2-FP-NEXT: vmovaps 3776(%rdi), %ymm2
17037 ; AVX2-FP-NEXT: vmovaps 3712(%rdi), %ymm1
17038 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
17039 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm6[0],ymm12[0],ymm6[2],ymm12[2]
17040 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3]
17041 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17042 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
17043 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
17044 ; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
17045 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
17046 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
17047 ; AVX2-FP-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
17048 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3]
17049 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17050 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
17051 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
17052 ; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
17053 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
17054 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
17055 ; AVX2-FP-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
17056 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3]
17057 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17058 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
17059 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
17060 ; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
17061 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
17062 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
17063 ; AVX2-FP-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
17064 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3]
17065 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17066 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
17067 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
17068 ; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
17069 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
17070 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
17071 ; AVX2-FP-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
17072 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3]
17073 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17074 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload
17075 ; AVX2-FP-NEXT: # ymm0 = ymm15[1],mem[1],ymm15[3],mem[3]
17076 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
17077 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
17078 ; AVX2-FP-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
17079 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3]
17080 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17081 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
17082 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
17083 ; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
17084 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
17085 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
17086 ; AVX2-FP-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
17087 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3]
17088 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17089 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm14[1],ymm11[3],ymm14[3]
17090 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
17091 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
17092 ; AVX2-FP-NEXT: # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3]
17093 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3]
17094 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17095 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
17096 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
17097 ; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
17098 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
17099 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
17100 ; AVX2-FP-NEXT: # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3]
17101 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3]
17102 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17103 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm8[1],ymm10[1],ymm8[3],ymm10[3]
17104 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
17105 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
17106 ; AVX2-FP-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3]
17107 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm8[2,3],ymm0[2,3]
17108 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17109 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
17110 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
17111 ; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
17112 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
17113 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
17114 ; AVX2-FP-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3]
17115 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm8[2,3],ymm0[2,3]
17116 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17117 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm7[1],ymm5[3],ymm7[3]
17118 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
17119 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
17120 ; AVX2-FP-NEXT: # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3]
17121 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm5[2,3],ymm0[2,3]
17122 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17123 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
17124 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
17125 ; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
17126 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
17127 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
17128 ; AVX2-FP-NEXT: # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3]
17129 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm5[2,3],ymm0[2,3]
17130 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17131 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm4[1],ymm3[3],ymm4[3]
17132 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload
17133 ; AVX2-FP-NEXT: # ymm3 = ymm9[1],mem[1],ymm9[3],mem[3]
17134 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
17135 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17136 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
17137 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
17138 ; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
17139 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
17140 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
17141 ; AVX2-FP-NEXT: # ymm3 = ymm3[1],mem[1],ymm3[3],mem[3]
17142 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
17143 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17144 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm2[1],ymm1[3],ymm2[3]
17145 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm6[1],ymm12[1],ymm6[3],ymm12[3]
17146 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
17147 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17148 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
17149 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
17150 ; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
17151 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17152 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
17153 ; AVX2-FP-NEXT: # ymm1 = ymm1[1],mem[1],ymm1[3],mem[3]
17154 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
17155 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17156 ; AVX2-FP-NEXT: vmovaps 96(%rdi), %xmm0
17157 ; AVX2-FP-NEXT: vinsertf128 $1, 224(%rdi), %ymm0, %ymm0
17158 ; AVX2-FP-NEXT: vmovaps 32(%rdi), %xmm1
17159 ; AVX2-FP-NEXT: vinsertf128 $1, 160(%rdi), %ymm1, %ymm1
17160 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
17161 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17162 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
17163 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17164 ; AVX2-FP-NEXT: vmovaps 352(%rdi), %xmm0
17165 ; AVX2-FP-NEXT: vinsertf128 $1, 480(%rdi), %ymm0, %ymm0
17166 ; AVX2-FP-NEXT: vmovaps 288(%rdi), %xmm1
17167 ; AVX2-FP-NEXT: vinsertf128 $1, 416(%rdi), %ymm1, %ymm1
17168 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
17169 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17170 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
17171 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17172 ; AVX2-FP-NEXT: vmovaps 608(%rdi), %xmm0
17173 ; AVX2-FP-NEXT: vinsertf128 $1, 736(%rdi), %ymm0, %ymm0
17174 ; AVX2-FP-NEXT: vmovaps 544(%rdi), %xmm1
17175 ; AVX2-FP-NEXT: vinsertf128 $1, 672(%rdi), %ymm1, %ymm1
17176 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
17177 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17178 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
17179 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17180 ; AVX2-FP-NEXT: vmovaps 864(%rdi), %xmm0
17181 ; AVX2-FP-NEXT: vinsertf128 $1, 992(%rdi), %ymm0, %ymm0
17182 ; AVX2-FP-NEXT: vmovaps 800(%rdi), %xmm1
17183 ; AVX2-FP-NEXT: vinsertf128 $1, 928(%rdi), %ymm1, %ymm1
17184 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
17185 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17186 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
17187 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17188 ; AVX2-FP-NEXT: vmovaps 1120(%rdi), %xmm0
17189 ; AVX2-FP-NEXT: vinsertf128 $1, 1248(%rdi), %ymm0, %ymm0
17190 ; AVX2-FP-NEXT: vmovaps 1056(%rdi), %xmm1
17191 ; AVX2-FP-NEXT: vinsertf128 $1, 1184(%rdi), %ymm1, %ymm1
17192 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
17193 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17194 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
17195 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17196 ; AVX2-FP-NEXT: vmovaps 1376(%rdi), %xmm0
17197 ; AVX2-FP-NEXT: vinsertf128 $1, 1504(%rdi), %ymm0, %ymm0
17198 ; AVX2-FP-NEXT: vmovaps 1312(%rdi), %xmm1
17199 ; AVX2-FP-NEXT: vinsertf128 $1, 1440(%rdi), %ymm1, %ymm1
17200 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
17201 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17202 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
17203 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17204 ; AVX2-FP-NEXT: vmovaps 1632(%rdi), %xmm0
17205 ; AVX2-FP-NEXT: vinsertf128 $1, 1760(%rdi), %ymm0, %ymm0
17206 ; AVX2-FP-NEXT: vmovaps 1568(%rdi), %xmm1
17207 ; AVX2-FP-NEXT: vinsertf128 $1, 1696(%rdi), %ymm1, %ymm1
17208 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
17209 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17210 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
17211 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17212 ; AVX2-FP-NEXT: vmovaps 1888(%rdi), %xmm0
17213 ; AVX2-FP-NEXT: vinsertf128 $1, 2016(%rdi), %ymm0, %ymm0
17214 ; AVX2-FP-NEXT: vmovaps 1824(%rdi), %xmm1
17215 ; AVX2-FP-NEXT: vinsertf128 $1, 1952(%rdi), %ymm1, %ymm1
17216 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
17217 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17218 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
17219 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17220 ; AVX2-FP-NEXT: vmovaps 2144(%rdi), %xmm0
17221 ; AVX2-FP-NEXT: vinsertf128 $1, 2272(%rdi), %ymm0, %ymm0
17222 ; AVX2-FP-NEXT: vmovaps 2080(%rdi), %xmm1
17223 ; AVX2-FP-NEXT: vinsertf128 $1, 2208(%rdi), %ymm1, %ymm1
17224 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
17225 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17226 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
17227 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17228 ; AVX2-FP-NEXT: vmovaps 2400(%rdi), %xmm0
17229 ; AVX2-FP-NEXT: vinsertf128 $1, 2528(%rdi), %ymm0, %ymm0
17230 ; AVX2-FP-NEXT: vmovaps 2336(%rdi), %xmm1
17231 ; AVX2-FP-NEXT: vinsertf128 $1, 2464(%rdi), %ymm1, %ymm1
17232 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
17233 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17234 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
17235 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17236 ; AVX2-FP-NEXT: vmovaps 2656(%rdi), %xmm0
17237 ; AVX2-FP-NEXT: vinsertf128 $1, 2784(%rdi), %ymm0, %ymm0
17238 ; AVX2-FP-NEXT: vmovaps 2592(%rdi), %xmm1
17239 ; AVX2-FP-NEXT: vinsertf128 $1, 2720(%rdi), %ymm1, %ymm1
17240 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
17241 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17242 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
17243 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17244 ; AVX2-FP-NEXT: vmovaps 2912(%rdi), %xmm0
17245 ; AVX2-FP-NEXT: vinsertf128 $1, 3040(%rdi), %ymm0, %ymm0
17246 ; AVX2-FP-NEXT: vmovaps 2848(%rdi), %xmm1
17247 ; AVX2-FP-NEXT: vinsertf128 $1, 2976(%rdi), %ymm1, %ymm1
17248 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
17249 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17250 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
17251 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17252 ; AVX2-FP-NEXT: vmovaps 3168(%rdi), %xmm0
17253 ; AVX2-FP-NEXT: vinsertf128 $1, 3296(%rdi), %ymm0, %ymm0
17254 ; AVX2-FP-NEXT: vmovaps 3104(%rdi), %xmm1
17255 ; AVX2-FP-NEXT: vinsertf128 $1, 3232(%rdi), %ymm1, %ymm1
17256 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
17257 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17258 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
17259 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17260 ; AVX2-FP-NEXT: vmovaps 3424(%rdi), %xmm0
17261 ; AVX2-FP-NEXT: vinsertf128 $1, 3552(%rdi), %ymm0, %ymm0
17262 ; AVX2-FP-NEXT: vmovaps 3360(%rdi), %xmm1
17263 ; AVX2-FP-NEXT: vinsertf128 $1, 3488(%rdi), %ymm1, %ymm1
17264 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
17265 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17266 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
17267 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17268 ; AVX2-FP-NEXT: vmovaps 3680(%rdi), %xmm0
17269 ; AVX2-FP-NEXT: vinsertf128 $1, 3808(%rdi), %ymm0, %ymm0
17270 ; AVX2-FP-NEXT: vmovaps 3616(%rdi), %xmm1
17271 ; AVX2-FP-NEXT: vinsertf128 $1, 3744(%rdi), %ymm1, %ymm1
17272 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
17273 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17274 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
17275 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17276 ; AVX2-FP-NEXT: vmovaps 3936(%rdi), %xmm0
17277 ; AVX2-FP-NEXT: vinsertf128 $1, 4064(%rdi), %ymm0, %ymm0
17278 ; AVX2-FP-NEXT: vmovaps 3872(%rdi), %xmm1
17279 ; AVX2-FP-NEXT: vinsertf128 $1, 4000(%rdi), %ymm1, %ymm1
17280 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
17281 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17282 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
17283 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17284 ; AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm2
17285 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17286 ; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm3
17287 ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17288 ; AVX2-FP-NEXT: vmovaps 224(%rdi), %ymm1
17289 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17290 ; AVX2-FP-NEXT: vmovaps 160(%rdi), %ymm0
17291 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17292 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
17293 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
17294 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
17295 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17296 ; AVX2-FP-NEXT: vmovaps 352(%rdi), %ymm2
17297 ; AVX2-FP-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill
17298 ; AVX2-FP-NEXT: vmovaps 288(%rdi), %ymm3
17299 ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17300 ; AVX2-FP-NEXT: vmovaps 480(%rdi), %ymm1
17301 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17302 ; AVX2-FP-NEXT: vmovaps 416(%rdi), %ymm0
17303 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17304 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
17305 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
17306 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
17307 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17308 ; AVX2-FP-NEXT: vmovaps 608(%rdi), %ymm2
17309 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17310 ; AVX2-FP-NEXT: vmovaps 544(%rdi), %ymm3
17311 ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17312 ; AVX2-FP-NEXT: vmovaps 736(%rdi), %ymm1
17313 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17314 ; AVX2-FP-NEXT: vmovaps 672(%rdi), %ymm0
17315 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17316 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
17317 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
17318 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
17319 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17320 ; AVX2-FP-NEXT: vmovaps 864(%rdi), %ymm2
17321 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17322 ; AVX2-FP-NEXT: vmovaps 800(%rdi), %ymm3
17323 ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17324 ; AVX2-FP-NEXT: vmovaps 992(%rdi), %ymm1
17325 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17326 ; AVX2-FP-NEXT: vmovaps 928(%rdi), %ymm0
17327 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17328 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
17329 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
17330 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
17331 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17332 ; AVX2-FP-NEXT: vmovaps 1120(%rdi), %ymm2
17333 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17334 ; AVX2-FP-NEXT: vmovaps 1056(%rdi), %ymm3
17335 ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17336 ; AVX2-FP-NEXT: vmovaps 1248(%rdi), %ymm1
17337 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17338 ; AVX2-FP-NEXT: vmovaps 1184(%rdi), %ymm0
17339 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17340 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
17341 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
17342 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
17343 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17344 ; AVX2-FP-NEXT: vmovaps 1376(%rdi), %ymm2
17345 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17346 ; AVX2-FP-NEXT: vmovaps 1312(%rdi), %ymm3
17347 ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17348 ; AVX2-FP-NEXT: vmovaps 1504(%rdi), %ymm1
17349 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17350 ; AVX2-FP-NEXT: vmovaps 1440(%rdi), %ymm0
17351 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17352 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
17353 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
17354 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
17355 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17356 ; AVX2-FP-NEXT: vmovaps 1632(%rdi), %ymm2
17357 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17358 ; AVX2-FP-NEXT: vmovaps 1568(%rdi), %ymm3
17359 ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17360 ; AVX2-FP-NEXT: vmovaps 1760(%rdi), %ymm1
17361 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17362 ; AVX2-FP-NEXT: vmovaps 1696(%rdi), %ymm0
17363 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17364 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
17365 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
17366 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
17367 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17368 ; AVX2-FP-NEXT: vmovaps 1888(%rdi), %ymm2
17369 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17370 ; AVX2-FP-NEXT: vmovaps 1824(%rdi), %ymm3
17371 ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17372 ; AVX2-FP-NEXT: vmovaps 2016(%rdi), %ymm1
17373 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17374 ; AVX2-FP-NEXT: vmovaps 1952(%rdi), %ymm0
17375 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17376 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
17377 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
17378 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
17379 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17380 ; AVX2-FP-NEXT: vmovaps 2144(%rdi), %ymm2
17381 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17382 ; AVX2-FP-NEXT: vmovaps 2080(%rdi), %ymm3
17383 ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17384 ; AVX2-FP-NEXT: vmovaps 2272(%rdi), %ymm1
17385 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17386 ; AVX2-FP-NEXT: vmovaps 2208(%rdi), %ymm0
17387 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17388 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
17389 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
17390 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
17391 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17392 ; AVX2-FP-NEXT: vmovaps 2400(%rdi), %ymm2
17393 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17394 ; AVX2-FP-NEXT: vmovaps 2336(%rdi), %ymm3
17395 ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17396 ; AVX2-FP-NEXT: vmovaps 2528(%rdi), %ymm1
17397 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17398 ; AVX2-FP-NEXT: vmovaps 2464(%rdi), %ymm0
17399 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17400 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
17401 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
17402 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
17403 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17404 ; AVX2-FP-NEXT: vmovaps 2656(%rdi), %ymm2
17405 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17406 ; AVX2-FP-NEXT: vmovaps 2592(%rdi), %ymm3
17407 ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17408 ; AVX2-FP-NEXT: vmovaps 2784(%rdi), %ymm1
17409 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17410 ; AVX2-FP-NEXT: vmovaps 2720(%rdi), %ymm0
17411 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17412 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
17413 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
17414 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
17415 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17416 ; AVX2-FP-NEXT: vmovaps 2912(%rdi), %ymm2
17417 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17418 ; AVX2-FP-NEXT: vmovaps 2848(%rdi), %ymm3
17419 ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17420 ; AVX2-FP-NEXT: vmovaps 3040(%rdi), %ymm1
17421 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17422 ; AVX2-FP-NEXT: vmovaps 2976(%rdi), %ymm0
17423 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17424 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
17425 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
17426 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
17427 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17428 ; AVX2-FP-NEXT: vmovaps 3168(%rdi), %ymm2
17429 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17430 ; AVX2-FP-NEXT: vmovaps 3104(%rdi), %ymm1
17431 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17432 ; AVX2-FP-NEXT: vmovaps 3296(%rdi), %ymm14
17433 ; AVX2-FP-NEXT: vmovaps 3232(%rdi), %ymm13
17434 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm14[0],ymm13[2],ymm14[2]
17435 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
17436 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
17437 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17438 ; AVX2-FP-NEXT: vmovaps 3424(%rdi), %ymm12
17439 ; AVX2-FP-NEXT: vmovaps 3360(%rdi), %ymm11
17440 ; AVX2-FP-NEXT: vmovaps 3552(%rdi), %ymm10
17441 ; AVX2-FP-NEXT: vmovaps 3488(%rdi), %ymm9
17442 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm9[0],ymm10[0],ymm9[2],ymm10[2]
17443 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm11[0],ymm12[0],ymm11[2],ymm12[2]
17444 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
17445 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17446 ; AVX2-FP-NEXT: vmovaps 3680(%rdi), %ymm8
17447 ; AVX2-FP-NEXT: vmovaps 3616(%rdi), %ymm7
17448 ; AVX2-FP-NEXT: vmovaps 3808(%rdi), %ymm6
17449 ; AVX2-FP-NEXT: vmovaps 3744(%rdi), %ymm5
17450 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm6[0],ymm5[2],ymm6[2]
17451 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm7[0],ymm8[0],ymm7[2],ymm8[2]
17452 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
17453 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17454 ; AVX2-FP-NEXT: vmovaps 3936(%rdi), %ymm4
17455 ; AVX2-FP-NEXT: vmovaps 3872(%rdi), %ymm3
17456 ; AVX2-FP-NEXT: vmovaps 4064(%rdi), %ymm2
17457 ; AVX2-FP-NEXT: vmovaps 4000(%rdi), %ymm1
17458 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
17459 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm3[0],ymm4[0],ymm3[2],ymm4[2]
17460 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3]
17461 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17462 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
17463 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
17464 ; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
17465 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
17466 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
17467 ; AVX2-FP-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3]
17468 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3]
17469 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17470 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
17471 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
17472 ; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
17473 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
17474 ; AVX2-FP-NEXT: vunpckhpd (%rsp), %ymm15, %ymm15 # 32-byte Folded Reload
17475 ; AVX2-FP-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3]
17476 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3]
17477 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17478 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
17479 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
17480 ; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
17481 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
17482 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
17483 ; AVX2-FP-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3]
17484 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3]
17485 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17486 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
17487 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
17488 ; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
17489 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
17490 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
17491 ; AVX2-FP-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3]
17492 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3]
17493 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17494 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
17495 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
17496 ; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
17497 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
17498 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
17499 ; AVX2-FP-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3]
17500 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3]
17501 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17502 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
17503 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
17504 ; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
17505 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
17506 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
17507 ; AVX2-FP-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3]
17508 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3]
17509 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17510 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
17511 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
17512 ; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
17513 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
17514 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
17515 ; AVX2-FP-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3]
17516 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3]
17517 ; AVX2-FP-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
17518 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
17519 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
17520 ; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
17521 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
17522 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
17523 ; AVX2-FP-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3]
17524 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3]
17525 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17526 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
17527 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
17528 ; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
17529 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
17530 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
17531 ; AVX2-FP-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3]
17532 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3]
17533 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17534 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
17535 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
17536 ; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
17537 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
17538 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
17539 ; AVX2-FP-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3]
17540 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3]
17541 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17542 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
17543 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
17544 ; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
17545 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
17546 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
17547 ; AVX2-FP-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3]
17548 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3]
17549 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17550 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
17551 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
17552 ; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
17553 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
17554 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
17555 ; AVX2-FP-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3]
17556 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm15[2,3],ymm0[2,3]
17557 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm13[1],ymm14[1],ymm13[3],ymm14[3]
17558 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
17559 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
17560 ; AVX2-FP-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
17561 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm13[2,3],ymm0[2,3]
17562 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm9[1],ymm10[1],ymm9[3],ymm10[3]
17563 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm11[1],ymm12[1],ymm11[3],ymm12[3]
17564 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3],ymm0[2,3]
17565 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm6[1],ymm5[3],ymm6[3]
17566 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm7[1],ymm8[1],ymm7[3],ymm8[3]
17567 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3],ymm0[2,3]
17568 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm2[1],ymm1[3],ymm2[3]
17569 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm4[1],ymm3[3],ymm4[3]
17570 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
17571 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17572 ; AVX2-FP-NEXT: vmovaps %ymm1, 448(%rsi)
17573 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17574 ; AVX2-FP-NEXT: vmovaps %ymm1, 384(%rsi)
17575 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17576 ; AVX2-FP-NEXT: vmovaps %ymm1, 320(%rsi)
17577 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17578 ; AVX2-FP-NEXT: vmovaps %ymm1, 256(%rsi)
17579 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17580 ; AVX2-FP-NEXT: vmovaps %ymm1, 192(%rsi)
17581 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17582 ; AVX2-FP-NEXT: vmovaps %ymm1, 128(%rsi)
17583 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17584 ; AVX2-FP-NEXT: vmovaps %ymm1, 64(%rsi)
17585 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17586 ; AVX2-FP-NEXT: vmovaps %ymm1, (%rsi)
17587 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17588 ; AVX2-FP-NEXT: vmovaps %ymm1, 480(%rsi)
17589 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17590 ; AVX2-FP-NEXT: vmovaps %ymm1, 416(%rsi)
17591 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17592 ; AVX2-FP-NEXT: vmovaps %ymm1, 352(%rsi)
17593 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17594 ; AVX2-FP-NEXT: vmovaps %ymm1, 288(%rsi)
17595 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17596 ; AVX2-FP-NEXT: vmovaps %ymm1, 224(%rsi)
17597 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17598 ; AVX2-FP-NEXT: vmovaps %ymm1, 160(%rsi)
17599 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17600 ; AVX2-FP-NEXT: vmovaps %ymm1, 96(%rsi)
17601 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17602 ; AVX2-FP-NEXT: vmovaps %ymm1, 32(%rsi)
17603 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17604 ; AVX2-FP-NEXT: vmovaps %ymm1, 448(%rdx)
17605 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17606 ; AVX2-FP-NEXT: vmovaps %ymm1, 384(%rdx)
17607 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17608 ; AVX2-FP-NEXT: vmovaps %ymm1, 320(%rdx)
17609 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17610 ; AVX2-FP-NEXT: vmovaps %ymm1, 256(%rdx)
17611 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17612 ; AVX2-FP-NEXT: vmovaps %ymm1, 192(%rdx)
17613 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17614 ; AVX2-FP-NEXT: vmovaps %ymm1, 128(%rdx)
17615 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17616 ; AVX2-FP-NEXT: vmovaps %ymm1, 64(%rdx)
17617 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17618 ; AVX2-FP-NEXT: vmovaps %ymm1, (%rdx)
17619 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17620 ; AVX2-FP-NEXT: vmovaps %ymm1, 480(%rdx)
17621 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17622 ; AVX2-FP-NEXT: vmovaps %ymm1, 416(%rdx)
17623 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17624 ; AVX2-FP-NEXT: vmovaps %ymm1, 352(%rdx)
17625 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17626 ; AVX2-FP-NEXT: vmovaps %ymm1, 288(%rdx)
17627 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17628 ; AVX2-FP-NEXT: vmovaps %ymm1, 224(%rdx)
17629 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17630 ; AVX2-FP-NEXT: vmovaps %ymm1, 160(%rdx)
17631 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17632 ; AVX2-FP-NEXT: vmovaps %ymm1, 96(%rdx)
17633 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17634 ; AVX2-FP-NEXT: vmovaps %ymm1, 32(%rdx)
17635 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17636 ; AVX2-FP-NEXT: vmovaps %ymm1, 448(%rcx)
17637 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17638 ; AVX2-FP-NEXT: vmovaps %ymm1, 384(%rcx)
17639 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17640 ; AVX2-FP-NEXT: vmovaps %ymm1, 320(%rcx)
17641 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17642 ; AVX2-FP-NEXT: vmovaps %ymm1, 256(%rcx)
17643 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17644 ; AVX2-FP-NEXT: vmovaps %ymm1, 192(%rcx)
17645 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17646 ; AVX2-FP-NEXT: vmovaps %ymm1, 128(%rcx)
17647 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17648 ; AVX2-FP-NEXT: vmovaps %ymm1, 64(%rcx)
17649 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17650 ; AVX2-FP-NEXT: vmovaps %ymm1, (%rcx)
17651 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17652 ; AVX2-FP-NEXT: vmovaps %ymm1, 480(%rcx)
17653 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17654 ; AVX2-FP-NEXT: vmovaps %ymm1, 416(%rcx)
17655 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17656 ; AVX2-FP-NEXT: vmovaps %ymm1, 352(%rcx)
17657 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17658 ; AVX2-FP-NEXT: vmovaps %ymm1, 288(%rcx)
17659 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17660 ; AVX2-FP-NEXT: vmovaps %ymm1, 224(%rcx)
17661 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17662 ; AVX2-FP-NEXT: vmovaps %ymm1, 160(%rcx)
17663 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17664 ; AVX2-FP-NEXT: vmovaps %ymm1, 96(%rcx)
17665 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17666 ; AVX2-FP-NEXT: vmovaps %ymm1, 32(%rcx)
17667 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17668 ; AVX2-FP-NEXT: vmovaps %ymm1, 480(%r8)
17669 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17670 ; AVX2-FP-NEXT: vmovaps %ymm1, 448(%r8)
17671 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17672 ; AVX2-FP-NEXT: vmovaps %ymm1, 416(%r8)
17673 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17674 ; AVX2-FP-NEXT: vmovaps %ymm1, 384(%r8)
17675 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17676 ; AVX2-FP-NEXT: vmovaps %ymm1, 352(%r8)
17677 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17678 ; AVX2-FP-NEXT: vmovaps %ymm1, 320(%r8)
17679 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17680 ; AVX2-FP-NEXT: vmovaps %ymm1, 288(%r8)
17681 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17682 ; AVX2-FP-NEXT: vmovaps %ymm1, 256(%r8)
17683 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17684 ; AVX2-FP-NEXT: vmovaps %ymm1, 224(%r8)
17685 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17686 ; AVX2-FP-NEXT: vmovaps %ymm1, 192(%r8)
17687 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17688 ; AVX2-FP-NEXT: vmovaps %ymm1, 160(%r8)
17689 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17690 ; AVX2-FP-NEXT: vmovaps %ymm1, 128(%r8)
17691 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17692 ; AVX2-FP-NEXT: vmovaps %ymm1, 96(%r8)
17693 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17694 ; AVX2-FP-NEXT: vmovaps %ymm1, 64(%r8)
17695 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17696 ; AVX2-FP-NEXT: vmovaps %ymm1, 32(%r8)
17697 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17698 ; AVX2-FP-NEXT: vmovaps %ymm1, (%r8)
17699 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17700 ; AVX2-FP-NEXT: vmovaps %ymm1, 480(%r9)
17701 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17702 ; AVX2-FP-NEXT: vmovaps %ymm1, 448(%r9)
17703 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17704 ; AVX2-FP-NEXT: vmovaps %ymm1, 416(%r9)
17705 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17706 ; AVX2-FP-NEXT: vmovaps %ymm1, 384(%r9)
17707 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17708 ; AVX2-FP-NEXT: vmovaps %ymm1, 352(%r9)
17709 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17710 ; AVX2-FP-NEXT: vmovaps %ymm1, 320(%r9)
17711 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17712 ; AVX2-FP-NEXT: vmovaps %ymm1, 288(%r9)
17713 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17714 ; AVX2-FP-NEXT: vmovaps %ymm1, 256(%r9)
17715 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17716 ; AVX2-FP-NEXT: vmovaps %ymm1, 224(%r9)
17717 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17718 ; AVX2-FP-NEXT: vmovaps %ymm1, 192(%r9)
17719 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17720 ; AVX2-FP-NEXT: vmovaps %ymm1, 160(%r9)
17721 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17722 ; AVX2-FP-NEXT: vmovaps %ymm1, 128(%r9)
17723 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17724 ; AVX2-FP-NEXT: vmovaps %ymm1, 96(%r9)
17725 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17726 ; AVX2-FP-NEXT: vmovaps %ymm1, 64(%r9)
17727 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17728 ; AVX2-FP-NEXT: vmovaps %ymm1, 32(%r9)
17729 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17730 ; AVX2-FP-NEXT: vmovaps %ymm1, (%r9)
17731 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
17732 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17733 ; AVX2-FP-NEXT: vmovaps %ymm1, 480(%rax)
17734 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17735 ; AVX2-FP-NEXT: vmovaps %ymm1, 448(%rax)
17736 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17737 ; AVX2-FP-NEXT: vmovaps %ymm1, 416(%rax)
17738 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17739 ; AVX2-FP-NEXT: vmovaps %ymm1, 384(%rax)
17740 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17741 ; AVX2-FP-NEXT: vmovaps %ymm1, 352(%rax)
17742 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17743 ; AVX2-FP-NEXT: vmovaps %ymm1, 320(%rax)
17744 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17745 ; AVX2-FP-NEXT: vmovaps %ymm1, 288(%rax)
17746 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17747 ; AVX2-FP-NEXT: vmovaps %ymm1, 256(%rax)
17748 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17749 ; AVX2-FP-NEXT: vmovaps %ymm1, 224(%rax)
17750 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17751 ; AVX2-FP-NEXT: vmovaps %ymm1, 192(%rax)
17752 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17753 ; AVX2-FP-NEXT: vmovaps %ymm1, 160(%rax)
17754 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17755 ; AVX2-FP-NEXT: vmovaps %ymm1, 128(%rax)
17756 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17757 ; AVX2-FP-NEXT: vmovaps %ymm1, 96(%rax)
17758 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17759 ; AVX2-FP-NEXT: vmovaps %ymm1, 64(%rax)
17760 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17761 ; AVX2-FP-NEXT: vmovaps %ymm1, 32(%rax)
17762 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17763 ; AVX2-FP-NEXT: vmovaps %ymm1, (%rax)
17764 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
17765 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17766 ; AVX2-FP-NEXT: vmovaps %ymm1, 480(%rax)
17767 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17768 ; AVX2-FP-NEXT: vmovaps %ymm1, 448(%rax)
17769 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17770 ; AVX2-FP-NEXT: vmovaps %ymm1, 416(%rax)
17771 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17772 ; AVX2-FP-NEXT: vmovaps %ymm1, 384(%rax)
17773 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17774 ; AVX2-FP-NEXT: vmovaps %ymm1, 352(%rax)
17775 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17776 ; AVX2-FP-NEXT: vmovaps %ymm1, 320(%rax)
17777 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17778 ; AVX2-FP-NEXT: vmovaps %ymm1, 288(%rax)
17779 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17780 ; AVX2-FP-NEXT: vmovaps %ymm1, 256(%rax)
17781 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17782 ; AVX2-FP-NEXT: vmovaps %ymm1, 224(%rax)
17783 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17784 ; AVX2-FP-NEXT: vmovaps %ymm1, 192(%rax)
17785 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17786 ; AVX2-FP-NEXT: vmovaps %ymm1, 160(%rax)
17787 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17788 ; AVX2-FP-NEXT: vmovaps %ymm1, 128(%rax)
17789 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17790 ; AVX2-FP-NEXT: vmovaps %ymm1, 96(%rax)
17791 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17792 ; AVX2-FP-NEXT: vmovaps %ymm1, 64(%rax)
17793 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17794 ; AVX2-FP-NEXT: vmovaps %ymm1, 32(%rax)
17795 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
17796 ; AVX2-FP-NEXT: vmovaps %ymm1, (%rax)
17797 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
17798 ; AVX2-FP-NEXT: vmovaps %ymm0, 480(%rax)
17799 ; AVX2-FP-NEXT: vmovaps %ymm5, 448(%rax)
17800 ; AVX2-FP-NEXT: vmovaps %ymm9, 416(%rax)
17801 ; AVX2-FP-NEXT: vmovaps %ymm13, 384(%rax)
17802 ; AVX2-FP-NEXT: vmovaps %ymm15, 352(%rax)
17803 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
17804 ; AVX2-FP-NEXT: vmovaps %ymm0, 320(%rax)
17805 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
17806 ; AVX2-FP-NEXT: vmovaps %ymm0, 288(%rax)
17807 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
17808 ; AVX2-FP-NEXT: vmovaps %ymm0, 256(%rax)
17809 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
17810 ; AVX2-FP-NEXT: vmovaps %ymm0, 224(%rax)
17811 ; AVX2-FP-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
17812 ; AVX2-FP-NEXT: vmovaps %ymm0, 192(%rax)
17813 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
17814 ; AVX2-FP-NEXT: vmovaps %ymm0, 160(%rax)
17815 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
17816 ; AVX2-FP-NEXT: vmovaps %ymm0, 128(%rax)
17817 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
17818 ; AVX2-FP-NEXT: vmovaps %ymm0, 96(%rax)
17819 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
17820 ; AVX2-FP-NEXT: vmovaps %ymm0, 64(%rax)
17821 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
17822 ; AVX2-FP-NEXT: vmovaps %ymm0, 32(%rax)
17823 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
17824 ; AVX2-FP-NEXT: vmovaps %ymm0, (%rax)
17825 ; AVX2-FP-NEXT: addq $5064, %rsp # imm = 0x13C8
17826 ; AVX2-FP-NEXT: vzeroupper
17827 ; AVX2-FP-NEXT: retq
17829 ; AVX2-FCP-LABEL: load_i64_stride8_vf64:
17830 ; AVX2-FCP: # %bb.0:
17831 ; AVX2-FCP-NEXT: subq $5064, %rsp # imm = 0x13C8
17832 ; AVX2-FCP-NEXT: vmovaps 320(%rdi), %xmm0
17833 ; AVX2-FCP-NEXT: vinsertf128 $1, 448(%rdi), %ymm0, %ymm0
17834 ; AVX2-FCP-NEXT: vmovaps 256(%rdi), %xmm1
17835 ; AVX2-FCP-NEXT: vinsertf128 $1, 384(%rdi), %ymm1, %ymm1
17836 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
17837 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17838 ; AVX2-FCP-NEXT: vmovaps 832(%rdi), %xmm2
17839 ; AVX2-FCP-NEXT: vinsertf128 $1, 960(%rdi), %ymm2, %ymm2
17840 ; AVX2-FCP-NEXT: vmovaps 768(%rdi), %xmm3
17841 ; AVX2-FCP-NEXT: vinsertf128 $1, 896(%rdi), %ymm3, %ymm3
17842 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
17843 ; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17844 ; AVX2-FCP-NEXT: vmovaps 1344(%rdi), %xmm4
17845 ; AVX2-FCP-NEXT: vinsertf128 $1, 1472(%rdi), %ymm4, %ymm4
17846 ; AVX2-FCP-NEXT: vmovaps 1280(%rdi), %xmm5
17847 ; AVX2-FCP-NEXT: vinsertf128 $1, 1408(%rdi), %ymm5, %ymm5
17848 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm5[0],ymm4[0],ymm5[2],ymm4[2]
17849 ; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17850 ; AVX2-FCP-NEXT: vmovaps 1856(%rdi), %xmm6
17851 ; AVX2-FCP-NEXT: vinsertf128 $1, 1984(%rdi), %ymm6, %ymm6
17852 ; AVX2-FCP-NEXT: vmovaps 1792(%rdi), %xmm7
17853 ; AVX2-FCP-NEXT: vinsertf128 $1, 1920(%rdi), %ymm7, %ymm7
17854 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm7[0],ymm6[0],ymm7[2],ymm6[2]
17855 ; AVX2-FCP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17856 ; AVX2-FCP-NEXT: vmovaps 2368(%rdi), %xmm8
17857 ; AVX2-FCP-NEXT: vinsertf128 $1, 2496(%rdi), %ymm8, %ymm8
17858 ; AVX2-FCP-NEXT: vmovaps 2304(%rdi), %xmm9
17859 ; AVX2-FCP-NEXT: vinsertf128 $1, 2432(%rdi), %ymm9, %ymm9
17860 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2]
17861 ; AVX2-FCP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17862 ; AVX2-FCP-NEXT: vmovaps 2880(%rdi), %xmm10
17863 ; AVX2-FCP-NEXT: vinsertf128 $1, 3008(%rdi), %ymm10, %ymm10
17864 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
17865 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17866 ; AVX2-FCP-NEXT: vmovaps 2816(%rdi), %xmm0
17867 ; AVX2-FCP-NEXT: vinsertf128 $1, 2944(%rdi), %ymm0, %ymm0
17868 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm2[1],ymm3[3],ymm2[3]
17869 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17870 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm10[0],ymm0[2],ymm10[2]
17871 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17872 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm5[1],ymm4[1],ymm5[3],ymm4[3]
17873 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17874 ; AVX2-FCP-NEXT: vmovaps 3392(%rdi), %xmm1
17875 ; AVX2-FCP-NEXT: vinsertf128 $1, 3520(%rdi), %ymm1, %ymm1
17876 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm7[1],ymm6[1],ymm7[3],ymm6[3]
17877 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17878 ; AVX2-FCP-NEXT: vmovaps 3328(%rdi), %xmm2
17879 ; AVX2-FCP-NEXT: vinsertf128 $1, 3456(%rdi), %ymm2, %ymm2
17880 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm9[1],ymm8[1],ymm9[3],ymm8[3]
17881 ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17882 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
17883 ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17884 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm10[1],ymm0[3],ymm10[3]
17885 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17886 ; AVX2-FCP-NEXT: vmovaps 3904(%rdi), %xmm0
17887 ; AVX2-FCP-NEXT: vinsertf128 $1, 4032(%rdi), %ymm0, %ymm0
17888 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3]
17889 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17890 ; AVX2-FCP-NEXT: vmovaps 3840(%rdi), %xmm1
17891 ; AVX2-FCP-NEXT: vinsertf128 $1, 3968(%rdi), %ymm1, %ymm1
17892 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
17893 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17894 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
17895 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17896 ; AVX2-FCP-NEXT: vmovaps 64(%rdi), %xmm0
17897 ; AVX2-FCP-NEXT: vinsertf128 $1, 192(%rdi), %ymm0, %ymm0
17898 ; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm1
17899 ; AVX2-FCP-NEXT: vinsertf128 $1, 128(%rdi), %ymm1, %ymm1
17900 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
17901 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17902 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
17903 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17904 ; AVX2-FCP-NEXT: vmovaps 576(%rdi), %xmm0
17905 ; AVX2-FCP-NEXT: vinsertf128 $1, 704(%rdi), %ymm0, %ymm0
17906 ; AVX2-FCP-NEXT: vmovaps 512(%rdi), %xmm1
17907 ; AVX2-FCP-NEXT: vinsertf128 $1, 640(%rdi), %ymm1, %ymm1
17908 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
17909 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17910 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
17911 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17912 ; AVX2-FCP-NEXT: vmovaps 1088(%rdi), %xmm0
17913 ; AVX2-FCP-NEXT: vinsertf128 $1, 1216(%rdi), %ymm0, %ymm0
17914 ; AVX2-FCP-NEXT: vmovaps 1024(%rdi), %xmm1
17915 ; AVX2-FCP-NEXT: vinsertf128 $1, 1152(%rdi), %ymm1, %ymm1
17916 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
17917 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17918 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
17919 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17920 ; AVX2-FCP-NEXT: vmovaps 1600(%rdi), %xmm0
17921 ; AVX2-FCP-NEXT: vinsertf128 $1, 1728(%rdi), %ymm0, %ymm0
17922 ; AVX2-FCP-NEXT: vmovaps 1536(%rdi), %xmm1
17923 ; AVX2-FCP-NEXT: vinsertf128 $1, 1664(%rdi), %ymm1, %ymm1
17924 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
17925 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17926 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
17927 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17928 ; AVX2-FCP-NEXT: vmovaps 2112(%rdi), %xmm0
17929 ; AVX2-FCP-NEXT: vinsertf128 $1, 2240(%rdi), %ymm0, %ymm0
17930 ; AVX2-FCP-NEXT: vmovaps 2048(%rdi), %xmm1
17931 ; AVX2-FCP-NEXT: vinsertf128 $1, 2176(%rdi), %ymm1, %ymm1
17932 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
17933 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17934 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
17935 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17936 ; AVX2-FCP-NEXT: vmovaps 2624(%rdi), %xmm0
17937 ; AVX2-FCP-NEXT: vinsertf128 $1, 2752(%rdi), %ymm0, %ymm0
17938 ; AVX2-FCP-NEXT: vmovaps 2560(%rdi), %xmm1
17939 ; AVX2-FCP-NEXT: vinsertf128 $1, 2688(%rdi), %ymm1, %ymm1
17940 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
17941 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17942 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
17943 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17944 ; AVX2-FCP-NEXT: vmovaps 3136(%rdi), %xmm0
17945 ; AVX2-FCP-NEXT: vinsertf128 $1, 3264(%rdi), %ymm0, %ymm0
17946 ; AVX2-FCP-NEXT: vmovaps 3072(%rdi), %xmm1
17947 ; AVX2-FCP-NEXT: vinsertf128 $1, 3200(%rdi), %ymm1, %ymm1
17948 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
17949 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17950 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
17951 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17952 ; AVX2-FCP-NEXT: vmovaps 3648(%rdi), %xmm0
17953 ; AVX2-FCP-NEXT: vinsertf128 $1, 3776(%rdi), %ymm0, %ymm0
17954 ; AVX2-FCP-NEXT: vmovaps 3584(%rdi), %xmm1
17955 ; AVX2-FCP-NEXT: vinsertf128 $1, 3712(%rdi), %ymm1, %ymm1
17956 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
17957 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17958 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
17959 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17960 ; AVX2-FCP-NEXT: vmovaps 320(%rdi), %ymm2
17961 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17962 ; AVX2-FCP-NEXT: vmovaps 256(%rdi), %ymm3
17963 ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17964 ; AVX2-FCP-NEXT: vmovaps 448(%rdi), %ymm1
17965 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17966 ; AVX2-FCP-NEXT: vmovaps 384(%rdi), %ymm0
17967 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17968 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
17969 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
17970 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
17971 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17972 ; AVX2-FCP-NEXT: vmovaps 832(%rdi), %ymm2
17973 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17974 ; AVX2-FCP-NEXT: vmovaps 768(%rdi), %ymm3
17975 ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17976 ; AVX2-FCP-NEXT: vmovaps 960(%rdi), %ymm1
17977 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17978 ; AVX2-FCP-NEXT: vmovaps 896(%rdi), %ymm0
17979 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17980 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
17981 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
17982 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
17983 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17984 ; AVX2-FCP-NEXT: vmovaps 1344(%rdi), %ymm2
17985 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17986 ; AVX2-FCP-NEXT: vmovaps 1280(%rdi), %ymm3
17987 ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17988 ; AVX2-FCP-NEXT: vmovaps 1472(%rdi), %ymm1
17989 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17990 ; AVX2-FCP-NEXT: vmovaps 1408(%rdi), %ymm0
17991 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17992 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
17993 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
17994 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
17995 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17996 ; AVX2-FCP-NEXT: vmovaps 1856(%rdi), %ymm2
17997 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
17998 ; AVX2-FCP-NEXT: vmovaps 1792(%rdi), %ymm3
17999 ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18000 ; AVX2-FCP-NEXT: vmovaps 1984(%rdi), %ymm1
18001 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18002 ; AVX2-FCP-NEXT: vmovaps 1920(%rdi), %ymm0
18003 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18004 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
18005 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
18006 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
18007 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18008 ; AVX2-FCP-NEXT: vmovaps 2368(%rdi), %ymm2
18009 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18010 ; AVX2-FCP-NEXT: vmovaps 2304(%rdi), %ymm3
18011 ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18012 ; AVX2-FCP-NEXT: vmovaps 2496(%rdi), %ymm1
18013 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18014 ; AVX2-FCP-NEXT: vmovaps 2432(%rdi), %ymm0
18015 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18016 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
18017 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
18018 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
18019 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18020 ; AVX2-FCP-NEXT: vmovaps 2880(%rdi), %ymm2
18021 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18022 ; AVX2-FCP-NEXT: vmovaps 2816(%rdi), %ymm3
18023 ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18024 ; AVX2-FCP-NEXT: vmovaps 3008(%rdi), %ymm1
18025 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18026 ; AVX2-FCP-NEXT: vmovaps 2944(%rdi), %ymm0
18027 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18028 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
18029 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
18030 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
18031 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18032 ; AVX2-FCP-NEXT: vmovaps 3392(%rdi), %ymm2
18033 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18034 ; AVX2-FCP-NEXT: vmovaps 3328(%rdi), %ymm3
18035 ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18036 ; AVX2-FCP-NEXT: vmovaps 3520(%rdi), %ymm1
18037 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18038 ; AVX2-FCP-NEXT: vmovaps 3456(%rdi), %ymm0
18039 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18040 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
18041 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
18042 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
18043 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18044 ; AVX2-FCP-NEXT: vmovaps 3904(%rdi), %ymm2
18045 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18046 ; AVX2-FCP-NEXT: vmovaps 3840(%rdi), %ymm3
18047 ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18048 ; AVX2-FCP-NEXT: vmovaps 4032(%rdi), %ymm1
18049 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18050 ; AVX2-FCP-NEXT: vmovaps 3968(%rdi), %ymm0
18051 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18052 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
18053 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
18054 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
18055 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18056 ; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm2
18057 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18058 ; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm3
18059 ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18060 ; AVX2-FCP-NEXT: vmovaps 192(%rdi), %ymm1
18061 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18062 ; AVX2-FCP-NEXT: vmovaps 128(%rdi), %ymm0
18063 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18064 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
18065 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
18066 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
18067 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18068 ; AVX2-FCP-NEXT: vmovaps 576(%rdi), %ymm2
18069 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18070 ; AVX2-FCP-NEXT: vmovaps 512(%rdi), %ymm3
18071 ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18072 ; AVX2-FCP-NEXT: vmovaps 704(%rdi), %ymm1
18073 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18074 ; AVX2-FCP-NEXT: vmovaps 640(%rdi), %ymm0
18075 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18076 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
18077 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
18078 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
18079 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18080 ; AVX2-FCP-NEXT: vmovaps 1088(%rdi), %ymm2
18081 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18082 ; AVX2-FCP-NEXT: vmovaps 1024(%rdi), %ymm3
18083 ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18084 ; AVX2-FCP-NEXT: vmovaps 1216(%rdi), %ymm0
18085 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18086 ; AVX2-FCP-NEXT: vmovaps 1152(%rdi), %ymm15
18087 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm0[0],ymm15[2],ymm0[2]
18088 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
18089 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
18090 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18091 ; AVX2-FCP-NEXT: vmovaps 1600(%rdi), %ymm2
18092 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18093 ; AVX2-FCP-NEXT: vmovaps 1536(%rdi), %ymm1
18094 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18095 ; AVX2-FCP-NEXT: vmovaps 1728(%rdi), %ymm14
18096 ; AVX2-FCP-NEXT: vmovaps 1664(%rdi), %ymm11
18097 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm14[0],ymm11[2],ymm14[2]
18098 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
18099 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
18100 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18101 ; AVX2-FCP-NEXT: vmovaps 2112(%rdi), %ymm2
18102 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18103 ; AVX2-FCP-NEXT: vmovaps 2048(%rdi), %ymm1
18104 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18105 ; AVX2-FCP-NEXT: vmovaps 2240(%rdi), %ymm10
18106 ; AVX2-FCP-NEXT: vmovaps 2176(%rdi), %ymm8
18107 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm8[0],ymm10[0],ymm8[2],ymm10[2]
18108 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
18109 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
18110 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18111 ; AVX2-FCP-NEXT: vmovaps 2624(%rdi), %ymm2
18112 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18113 ; AVX2-FCP-NEXT: vmovaps 2560(%rdi), %ymm1
18114 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18115 ; AVX2-FCP-NEXT: vmovaps 2752(%rdi), %ymm7
18116 ; AVX2-FCP-NEXT: vmovaps 2688(%rdi), %ymm5
18117 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm7[0],ymm5[2],ymm7[2]
18118 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
18119 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
18120 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18121 ; AVX2-FCP-NEXT: vmovaps 3136(%rdi), %ymm1
18122 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18123 ; AVX2-FCP-NEXT: vmovaps 3072(%rdi), %ymm9
18124 ; AVX2-FCP-NEXT: vmovaps 3264(%rdi), %ymm4
18125 ; AVX2-FCP-NEXT: vmovaps 3200(%rdi), %ymm3
18126 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm4[0],ymm3[2],ymm4[2]
18127 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm9[0],ymm1[0],ymm9[2],ymm1[2]
18128 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3]
18129 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18130 ; AVX2-FCP-NEXT: vmovaps 3648(%rdi), %ymm12
18131 ; AVX2-FCP-NEXT: vmovaps 3584(%rdi), %ymm6
18132 ; AVX2-FCP-NEXT: vmovaps 3776(%rdi), %ymm2
18133 ; AVX2-FCP-NEXT: vmovaps 3712(%rdi), %ymm1
18134 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
18135 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm6[0],ymm12[0],ymm6[2],ymm12[2]
18136 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3]
18137 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18138 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
18139 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
18140 ; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
18141 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
18142 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
18143 ; AVX2-FCP-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
18144 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3]
18145 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18146 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
18147 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
18148 ; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
18149 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
18150 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
18151 ; AVX2-FCP-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
18152 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3]
18153 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18154 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
18155 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
18156 ; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
18157 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
18158 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
18159 ; AVX2-FCP-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
18160 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3]
18161 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18162 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
18163 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
18164 ; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
18165 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
18166 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
18167 ; AVX2-FCP-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
18168 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3]
18169 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18170 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload
18171 ; AVX2-FCP-NEXT: # ymm0 = ymm15[1],mem[1],ymm15[3],mem[3]
18172 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
18173 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
18174 ; AVX2-FCP-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
18175 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3]
18176 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18177 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
18178 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
18179 ; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
18180 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
18181 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
18182 ; AVX2-FCP-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
18183 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3]
18184 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18185 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm14[1],ymm11[3],ymm14[3]
18186 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
18187 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
18188 ; AVX2-FCP-NEXT: # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3]
18189 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3]
18190 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18191 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
18192 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
18193 ; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
18194 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
18195 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
18196 ; AVX2-FCP-NEXT: # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3]
18197 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3]
18198 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18199 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm8[1],ymm10[1],ymm8[3],ymm10[3]
18200 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
18201 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
18202 ; AVX2-FCP-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3]
18203 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm8[2,3],ymm0[2,3]
18204 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18205 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
18206 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
18207 ; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
18208 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
18209 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
18210 ; AVX2-FCP-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3]
18211 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm8[2,3],ymm0[2,3]
18212 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18213 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm7[1],ymm5[3],ymm7[3]
18214 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
18215 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
18216 ; AVX2-FCP-NEXT: # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3]
18217 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm5[2,3],ymm0[2,3]
18218 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18219 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
18220 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
18221 ; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
18222 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
18223 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
18224 ; AVX2-FCP-NEXT: # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3]
18225 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm5[2,3],ymm0[2,3]
18226 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18227 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm4[1],ymm3[3],ymm4[3]
18228 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload
18229 ; AVX2-FCP-NEXT: # ymm3 = ymm9[1],mem[1],ymm9[3],mem[3]
18230 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
18231 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18232 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
18233 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
18234 ; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
18235 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
18236 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
18237 ; AVX2-FCP-NEXT: # ymm3 = ymm3[1],mem[1],ymm3[3],mem[3]
18238 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
18239 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18240 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm2[1],ymm1[3],ymm2[3]
18241 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm6[1],ymm12[1],ymm6[3],ymm12[3]
18242 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
18243 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18244 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
18245 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
18246 ; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
18247 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18248 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
18249 ; AVX2-FCP-NEXT: # ymm1 = ymm1[1],mem[1],ymm1[3],mem[3]
18250 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
18251 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18252 ; AVX2-FCP-NEXT: vmovaps 96(%rdi), %xmm0
18253 ; AVX2-FCP-NEXT: vinsertf128 $1, 224(%rdi), %ymm0, %ymm0
18254 ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %xmm1
18255 ; AVX2-FCP-NEXT: vinsertf128 $1, 160(%rdi), %ymm1, %ymm1
18256 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
18257 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18258 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
18259 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18260 ; AVX2-FCP-NEXT: vmovaps 352(%rdi), %xmm0
18261 ; AVX2-FCP-NEXT: vinsertf128 $1, 480(%rdi), %ymm0, %ymm0
18262 ; AVX2-FCP-NEXT: vmovaps 288(%rdi), %xmm1
18263 ; AVX2-FCP-NEXT: vinsertf128 $1, 416(%rdi), %ymm1, %ymm1
18264 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
18265 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18266 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
18267 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18268 ; AVX2-FCP-NEXT: vmovaps 608(%rdi), %xmm0
18269 ; AVX2-FCP-NEXT: vinsertf128 $1, 736(%rdi), %ymm0, %ymm0
18270 ; AVX2-FCP-NEXT: vmovaps 544(%rdi), %xmm1
18271 ; AVX2-FCP-NEXT: vinsertf128 $1, 672(%rdi), %ymm1, %ymm1
18272 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
18273 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18274 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
18275 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18276 ; AVX2-FCP-NEXT: vmovaps 864(%rdi), %xmm0
18277 ; AVX2-FCP-NEXT: vinsertf128 $1, 992(%rdi), %ymm0, %ymm0
18278 ; AVX2-FCP-NEXT: vmovaps 800(%rdi), %xmm1
18279 ; AVX2-FCP-NEXT: vinsertf128 $1, 928(%rdi), %ymm1, %ymm1
18280 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
18281 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18282 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
18283 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18284 ; AVX2-FCP-NEXT: vmovaps 1120(%rdi), %xmm0
18285 ; AVX2-FCP-NEXT: vinsertf128 $1, 1248(%rdi), %ymm0, %ymm0
18286 ; AVX2-FCP-NEXT: vmovaps 1056(%rdi), %xmm1
18287 ; AVX2-FCP-NEXT: vinsertf128 $1, 1184(%rdi), %ymm1, %ymm1
18288 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
18289 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18290 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
18291 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18292 ; AVX2-FCP-NEXT: vmovaps 1376(%rdi), %xmm0
18293 ; AVX2-FCP-NEXT: vinsertf128 $1, 1504(%rdi), %ymm0, %ymm0
18294 ; AVX2-FCP-NEXT: vmovaps 1312(%rdi), %xmm1
18295 ; AVX2-FCP-NEXT: vinsertf128 $1, 1440(%rdi), %ymm1, %ymm1
18296 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
18297 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18298 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
18299 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18300 ; AVX2-FCP-NEXT: vmovaps 1632(%rdi), %xmm0
18301 ; AVX2-FCP-NEXT: vinsertf128 $1, 1760(%rdi), %ymm0, %ymm0
18302 ; AVX2-FCP-NEXT: vmovaps 1568(%rdi), %xmm1
18303 ; AVX2-FCP-NEXT: vinsertf128 $1, 1696(%rdi), %ymm1, %ymm1
18304 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
18305 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18306 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
18307 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18308 ; AVX2-FCP-NEXT: vmovaps 1888(%rdi), %xmm0
18309 ; AVX2-FCP-NEXT: vinsertf128 $1, 2016(%rdi), %ymm0, %ymm0
18310 ; AVX2-FCP-NEXT: vmovaps 1824(%rdi), %xmm1
18311 ; AVX2-FCP-NEXT: vinsertf128 $1, 1952(%rdi), %ymm1, %ymm1
18312 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
18313 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18314 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
18315 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18316 ; AVX2-FCP-NEXT: vmovaps 2144(%rdi), %xmm0
18317 ; AVX2-FCP-NEXT: vinsertf128 $1, 2272(%rdi), %ymm0, %ymm0
18318 ; AVX2-FCP-NEXT: vmovaps 2080(%rdi), %xmm1
18319 ; AVX2-FCP-NEXT: vinsertf128 $1, 2208(%rdi), %ymm1, %ymm1
18320 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
18321 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18322 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
18323 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18324 ; AVX2-FCP-NEXT: vmovaps 2400(%rdi), %xmm0
18325 ; AVX2-FCP-NEXT: vinsertf128 $1, 2528(%rdi), %ymm0, %ymm0
18326 ; AVX2-FCP-NEXT: vmovaps 2336(%rdi), %xmm1
18327 ; AVX2-FCP-NEXT: vinsertf128 $1, 2464(%rdi), %ymm1, %ymm1
18328 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
18329 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18330 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
18331 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18332 ; AVX2-FCP-NEXT: vmovaps 2656(%rdi), %xmm0
18333 ; AVX2-FCP-NEXT: vinsertf128 $1, 2784(%rdi), %ymm0, %ymm0
18334 ; AVX2-FCP-NEXT: vmovaps 2592(%rdi), %xmm1
18335 ; AVX2-FCP-NEXT: vinsertf128 $1, 2720(%rdi), %ymm1, %ymm1
18336 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
18337 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18338 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
18339 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18340 ; AVX2-FCP-NEXT: vmovaps 2912(%rdi), %xmm0
18341 ; AVX2-FCP-NEXT: vinsertf128 $1, 3040(%rdi), %ymm0, %ymm0
18342 ; AVX2-FCP-NEXT: vmovaps 2848(%rdi), %xmm1
18343 ; AVX2-FCP-NEXT: vinsertf128 $1, 2976(%rdi), %ymm1, %ymm1
18344 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
18345 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18346 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
18347 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18348 ; AVX2-FCP-NEXT: vmovaps 3168(%rdi), %xmm0
18349 ; AVX2-FCP-NEXT: vinsertf128 $1, 3296(%rdi), %ymm0, %ymm0
18350 ; AVX2-FCP-NEXT: vmovaps 3104(%rdi), %xmm1
18351 ; AVX2-FCP-NEXT: vinsertf128 $1, 3232(%rdi), %ymm1, %ymm1
18352 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
18353 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18354 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
18355 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18356 ; AVX2-FCP-NEXT: vmovaps 3424(%rdi), %xmm0
18357 ; AVX2-FCP-NEXT: vinsertf128 $1, 3552(%rdi), %ymm0, %ymm0
18358 ; AVX2-FCP-NEXT: vmovaps 3360(%rdi), %xmm1
18359 ; AVX2-FCP-NEXT: vinsertf128 $1, 3488(%rdi), %ymm1, %ymm1
18360 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
18361 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18362 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
18363 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18364 ; AVX2-FCP-NEXT: vmovaps 3680(%rdi), %xmm0
18365 ; AVX2-FCP-NEXT: vinsertf128 $1, 3808(%rdi), %ymm0, %ymm0
18366 ; AVX2-FCP-NEXT: vmovaps 3616(%rdi), %xmm1
18367 ; AVX2-FCP-NEXT: vinsertf128 $1, 3744(%rdi), %ymm1, %ymm1
18368 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
18369 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18370 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
18371 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18372 ; AVX2-FCP-NEXT: vmovaps 3936(%rdi), %xmm0
18373 ; AVX2-FCP-NEXT: vinsertf128 $1, 4064(%rdi), %ymm0, %ymm0
18374 ; AVX2-FCP-NEXT: vmovaps 3872(%rdi), %xmm1
18375 ; AVX2-FCP-NEXT: vinsertf128 $1, 4000(%rdi), %ymm1, %ymm1
18376 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
18377 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18378 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
18379 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18380 ; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm2
18381 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18382 ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm3
18383 ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18384 ; AVX2-FCP-NEXT: vmovaps 224(%rdi), %ymm1
18385 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18386 ; AVX2-FCP-NEXT: vmovaps 160(%rdi), %ymm0
18387 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18388 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
18389 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
18390 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
18391 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18392 ; AVX2-FCP-NEXT: vmovaps 352(%rdi), %ymm2
18393 ; AVX2-FCP-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill
18394 ; AVX2-FCP-NEXT: vmovaps 288(%rdi), %ymm3
18395 ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18396 ; AVX2-FCP-NEXT: vmovaps 480(%rdi), %ymm1
18397 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18398 ; AVX2-FCP-NEXT: vmovaps 416(%rdi), %ymm0
18399 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18400 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
18401 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
18402 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
18403 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18404 ; AVX2-FCP-NEXT: vmovaps 608(%rdi), %ymm2
18405 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18406 ; AVX2-FCP-NEXT: vmovaps 544(%rdi), %ymm3
18407 ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18408 ; AVX2-FCP-NEXT: vmovaps 736(%rdi), %ymm1
18409 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18410 ; AVX2-FCP-NEXT: vmovaps 672(%rdi), %ymm0
18411 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18412 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
18413 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
18414 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
18415 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18416 ; AVX2-FCP-NEXT: vmovaps 864(%rdi), %ymm2
18417 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18418 ; AVX2-FCP-NEXT: vmovaps 800(%rdi), %ymm3
18419 ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18420 ; AVX2-FCP-NEXT: vmovaps 992(%rdi), %ymm1
18421 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18422 ; AVX2-FCP-NEXT: vmovaps 928(%rdi), %ymm0
18423 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18424 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
18425 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
18426 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
18427 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18428 ; AVX2-FCP-NEXT: vmovaps 1120(%rdi), %ymm2
18429 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18430 ; AVX2-FCP-NEXT: vmovaps 1056(%rdi), %ymm3
18431 ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18432 ; AVX2-FCP-NEXT: vmovaps 1248(%rdi), %ymm1
18433 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18434 ; AVX2-FCP-NEXT: vmovaps 1184(%rdi), %ymm0
18435 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18436 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
18437 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
18438 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
18439 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18440 ; AVX2-FCP-NEXT: vmovaps 1376(%rdi), %ymm2
18441 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18442 ; AVX2-FCP-NEXT: vmovaps 1312(%rdi), %ymm3
18443 ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18444 ; AVX2-FCP-NEXT: vmovaps 1504(%rdi), %ymm1
18445 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18446 ; AVX2-FCP-NEXT: vmovaps 1440(%rdi), %ymm0
18447 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18448 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
18449 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
18450 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
18451 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18452 ; AVX2-FCP-NEXT: vmovaps 1632(%rdi), %ymm2
18453 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18454 ; AVX2-FCP-NEXT: vmovaps 1568(%rdi), %ymm3
18455 ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18456 ; AVX2-FCP-NEXT: vmovaps 1760(%rdi), %ymm1
18457 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18458 ; AVX2-FCP-NEXT: vmovaps 1696(%rdi), %ymm0
18459 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18460 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
18461 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
18462 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
18463 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18464 ; AVX2-FCP-NEXT: vmovaps 1888(%rdi), %ymm2
18465 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18466 ; AVX2-FCP-NEXT: vmovaps 1824(%rdi), %ymm3
18467 ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18468 ; AVX2-FCP-NEXT: vmovaps 2016(%rdi), %ymm1
18469 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18470 ; AVX2-FCP-NEXT: vmovaps 1952(%rdi), %ymm0
18471 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18472 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
18473 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
18474 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
18475 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18476 ; AVX2-FCP-NEXT: vmovaps 2144(%rdi), %ymm2
18477 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18478 ; AVX2-FCP-NEXT: vmovaps 2080(%rdi), %ymm3
18479 ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18480 ; AVX2-FCP-NEXT: vmovaps 2272(%rdi), %ymm1
18481 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18482 ; AVX2-FCP-NEXT: vmovaps 2208(%rdi), %ymm0
18483 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18484 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
18485 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
18486 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
18487 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18488 ; AVX2-FCP-NEXT: vmovaps 2400(%rdi), %ymm2
18489 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18490 ; AVX2-FCP-NEXT: vmovaps 2336(%rdi), %ymm3
18491 ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18492 ; AVX2-FCP-NEXT: vmovaps 2528(%rdi), %ymm1
18493 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18494 ; AVX2-FCP-NEXT: vmovaps 2464(%rdi), %ymm0
18495 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18496 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
18497 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
18498 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
18499 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18500 ; AVX2-FCP-NEXT: vmovaps 2656(%rdi), %ymm2
18501 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18502 ; AVX2-FCP-NEXT: vmovaps 2592(%rdi), %ymm3
18503 ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18504 ; AVX2-FCP-NEXT: vmovaps 2784(%rdi), %ymm1
18505 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18506 ; AVX2-FCP-NEXT: vmovaps 2720(%rdi), %ymm0
18507 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18508 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
18509 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
18510 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
18511 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18512 ; AVX2-FCP-NEXT: vmovaps 2912(%rdi), %ymm2
18513 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18514 ; AVX2-FCP-NEXT: vmovaps 2848(%rdi), %ymm3
18515 ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18516 ; AVX2-FCP-NEXT: vmovaps 3040(%rdi), %ymm1
18517 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18518 ; AVX2-FCP-NEXT: vmovaps 2976(%rdi), %ymm0
18519 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18520 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
18521 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
18522 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
18523 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18524 ; AVX2-FCP-NEXT: vmovaps 3168(%rdi), %ymm2
18525 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18526 ; AVX2-FCP-NEXT: vmovaps 3104(%rdi), %ymm1
18527 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18528 ; AVX2-FCP-NEXT: vmovaps 3296(%rdi), %ymm14
18529 ; AVX2-FCP-NEXT: vmovaps 3232(%rdi), %ymm13
18530 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm14[0],ymm13[2],ymm14[2]
18531 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
18532 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
18533 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18534 ; AVX2-FCP-NEXT: vmovaps 3424(%rdi), %ymm12
18535 ; AVX2-FCP-NEXT: vmovaps 3360(%rdi), %ymm11
18536 ; AVX2-FCP-NEXT: vmovaps 3552(%rdi), %ymm10
18537 ; AVX2-FCP-NEXT: vmovaps 3488(%rdi), %ymm9
18538 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm9[0],ymm10[0],ymm9[2],ymm10[2]
18539 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm11[0],ymm12[0],ymm11[2],ymm12[2]
18540 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
18541 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18542 ; AVX2-FCP-NEXT: vmovaps 3680(%rdi), %ymm8
18543 ; AVX2-FCP-NEXT: vmovaps 3616(%rdi), %ymm7
18544 ; AVX2-FCP-NEXT: vmovaps 3808(%rdi), %ymm6
18545 ; AVX2-FCP-NEXT: vmovaps 3744(%rdi), %ymm5
18546 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm6[0],ymm5[2],ymm6[2]
18547 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm7[0],ymm8[0],ymm7[2],ymm8[2]
18548 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
18549 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18550 ; AVX2-FCP-NEXT: vmovaps 3936(%rdi), %ymm4
18551 ; AVX2-FCP-NEXT: vmovaps 3872(%rdi), %ymm3
18552 ; AVX2-FCP-NEXT: vmovaps 4064(%rdi), %ymm2
18553 ; AVX2-FCP-NEXT: vmovaps 4000(%rdi), %ymm1
18554 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
18555 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm3[0],ymm4[0],ymm3[2],ymm4[2]
18556 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3]
18557 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18558 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
18559 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
18560 ; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
18561 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
18562 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
18563 ; AVX2-FCP-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3]
18564 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3]
18565 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18566 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
18567 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
18568 ; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
18569 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
18570 ; AVX2-FCP-NEXT: vunpckhpd (%rsp), %ymm15, %ymm15 # 32-byte Folded Reload
18571 ; AVX2-FCP-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3]
18572 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3]
18573 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18574 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
18575 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
18576 ; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
18577 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
18578 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
18579 ; AVX2-FCP-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3]
18580 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3]
18581 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18582 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
18583 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
18584 ; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
18585 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
18586 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
18587 ; AVX2-FCP-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3]
18588 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3]
18589 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18590 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
18591 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
18592 ; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
18593 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
18594 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
18595 ; AVX2-FCP-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3]
18596 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3]
18597 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18598 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
18599 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
18600 ; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
18601 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
18602 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
18603 ; AVX2-FCP-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3]
18604 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3]
18605 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18606 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
18607 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
18608 ; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
18609 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
18610 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
18611 ; AVX2-FCP-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3]
18612 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3]
18613 ; AVX2-FCP-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
18614 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
18615 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
18616 ; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
18617 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
18618 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
18619 ; AVX2-FCP-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3]
18620 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3]
18621 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18622 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
18623 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
18624 ; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
18625 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
18626 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
18627 ; AVX2-FCP-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3]
18628 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3]
18629 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18630 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
18631 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
18632 ; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
18633 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
18634 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
18635 ; AVX2-FCP-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3]
18636 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3]
18637 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18638 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
18639 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
18640 ; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
18641 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
18642 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
18643 ; AVX2-FCP-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3]
18644 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3]
18645 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18646 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
18647 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
18648 ; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
18649 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
18650 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
18651 ; AVX2-FCP-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3]
18652 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm15[2,3],ymm0[2,3]
18653 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm13[1],ymm14[1],ymm13[3],ymm14[3]
18654 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
18655 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
18656 ; AVX2-FCP-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
18657 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm13[2,3],ymm0[2,3]
18658 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm9[1],ymm10[1],ymm9[3],ymm10[3]
18659 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm11[1],ymm12[1],ymm11[3],ymm12[3]
18660 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3],ymm0[2,3]
18661 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm6[1],ymm5[3],ymm6[3]
18662 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm7[1],ymm8[1],ymm7[3],ymm8[3]
18663 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3],ymm0[2,3]
18664 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm2[1],ymm1[3],ymm2[3]
18665 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm4[1],ymm3[3],ymm4[3]
18666 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
18667 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18668 ; AVX2-FCP-NEXT: vmovaps %ymm1, 448(%rsi)
18669 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18670 ; AVX2-FCP-NEXT: vmovaps %ymm1, 384(%rsi)
18671 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18672 ; AVX2-FCP-NEXT: vmovaps %ymm1, 320(%rsi)
18673 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18674 ; AVX2-FCP-NEXT: vmovaps %ymm1, 256(%rsi)
18675 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18676 ; AVX2-FCP-NEXT: vmovaps %ymm1, 192(%rsi)
18677 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18678 ; AVX2-FCP-NEXT: vmovaps %ymm1, 128(%rsi)
18679 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18680 ; AVX2-FCP-NEXT: vmovaps %ymm1, 64(%rsi)
18681 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18682 ; AVX2-FCP-NEXT: vmovaps %ymm1, (%rsi)
18683 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18684 ; AVX2-FCP-NEXT: vmovaps %ymm1, 480(%rsi)
18685 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18686 ; AVX2-FCP-NEXT: vmovaps %ymm1, 416(%rsi)
18687 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18688 ; AVX2-FCP-NEXT: vmovaps %ymm1, 352(%rsi)
18689 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18690 ; AVX2-FCP-NEXT: vmovaps %ymm1, 288(%rsi)
18691 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18692 ; AVX2-FCP-NEXT: vmovaps %ymm1, 224(%rsi)
18693 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18694 ; AVX2-FCP-NEXT: vmovaps %ymm1, 160(%rsi)
18695 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18696 ; AVX2-FCP-NEXT: vmovaps %ymm1, 96(%rsi)
18697 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18698 ; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%rsi)
18699 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18700 ; AVX2-FCP-NEXT: vmovaps %ymm1, 448(%rdx)
18701 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18702 ; AVX2-FCP-NEXT: vmovaps %ymm1, 384(%rdx)
18703 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18704 ; AVX2-FCP-NEXT: vmovaps %ymm1, 320(%rdx)
18705 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18706 ; AVX2-FCP-NEXT: vmovaps %ymm1, 256(%rdx)
18707 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18708 ; AVX2-FCP-NEXT: vmovaps %ymm1, 192(%rdx)
18709 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18710 ; AVX2-FCP-NEXT: vmovaps %ymm1, 128(%rdx)
18711 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18712 ; AVX2-FCP-NEXT: vmovaps %ymm1, 64(%rdx)
18713 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18714 ; AVX2-FCP-NEXT: vmovaps %ymm1, (%rdx)
18715 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18716 ; AVX2-FCP-NEXT: vmovaps %ymm1, 480(%rdx)
18717 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18718 ; AVX2-FCP-NEXT: vmovaps %ymm1, 416(%rdx)
18719 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18720 ; AVX2-FCP-NEXT: vmovaps %ymm1, 352(%rdx)
18721 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18722 ; AVX2-FCP-NEXT: vmovaps %ymm1, 288(%rdx)
18723 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18724 ; AVX2-FCP-NEXT: vmovaps %ymm1, 224(%rdx)
18725 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18726 ; AVX2-FCP-NEXT: vmovaps %ymm1, 160(%rdx)
18727 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18728 ; AVX2-FCP-NEXT: vmovaps %ymm1, 96(%rdx)
18729 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18730 ; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%rdx)
18731 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18732 ; AVX2-FCP-NEXT: vmovaps %ymm1, 448(%rcx)
18733 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18734 ; AVX2-FCP-NEXT: vmovaps %ymm1, 384(%rcx)
18735 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18736 ; AVX2-FCP-NEXT: vmovaps %ymm1, 320(%rcx)
18737 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18738 ; AVX2-FCP-NEXT: vmovaps %ymm1, 256(%rcx)
18739 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18740 ; AVX2-FCP-NEXT: vmovaps %ymm1, 192(%rcx)
18741 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18742 ; AVX2-FCP-NEXT: vmovaps %ymm1, 128(%rcx)
18743 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18744 ; AVX2-FCP-NEXT: vmovaps %ymm1, 64(%rcx)
18745 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18746 ; AVX2-FCP-NEXT: vmovaps %ymm1, (%rcx)
18747 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18748 ; AVX2-FCP-NEXT: vmovaps %ymm1, 480(%rcx)
18749 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18750 ; AVX2-FCP-NEXT: vmovaps %ymm1, 416(%rcx)
18751 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18752 ; AVX2-FCP-NEXT: vmovaps %ymm1, 352(%rcx)
18753 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18754 ; AVX2-FCP-NEXT: vmovaps %ymm1, 288(%rcx)
18755 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18756 ; AVX2-FCP-NEXT: vmovaps %ymm1, 224(%rcx)
18757 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18758 ; AVX2-FCP-NEXT: vmovaps %ymm1, 160(%rcx)
18759 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18760 ; AVX2-FCP-NEXT: vmovaps %ymm1, 96(%rcx)
18761 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18762 ; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%rcx)
18763 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18764 ; AVX2-FCP-NEXT: vmovaps %ymm1, 480(%r8)
18765 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18766 ; AVX2-FCP-NEXT: vmovaps %ymm1, 448(%r8)
18767 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18768 ; AVX2-FCP-NEXT: vmovaps %ymm1, 416(%r8)
18769 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18770 ; AVX2-FCP-NEXT: vmovaps %ymm1, 384(%r8)
18771 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18772 ; AVX2-FCP-NEXT: vmovaps %ymm1, 352(%r8)
18773 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18774 ; AVX2-FCP-NEXT: vmovaps %ymm1, 320(%r8)
18775 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18776 ; AVX2-FCP-NEXT: vmovaps %ymm1, 288(%r8)
18777 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18778 ; AVX2-FCP-NEXT: vmovaps %ymm1, 256(%r8)
18779 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18780 ; AVX2-FCP-NEXT: vmovaps %ymm1, 224(%r8)
18781 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18782 ; AVX2-FCP-NEXT: vmovaps %ymm1, 192(%r8)
18783 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18784 ; AVX2-FCP-NEXT: vmovaps %ymm1, 160(%r8)
18785 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18786 ; AVX2-FCP-NEXT: vmovaps %ymm1, 128(%r8)
18787 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18788 ; AVX2-FCP-NEXT: vmovaps %ymm1, 96(%r8)
18789 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18790 ; AVX2-FCP-NEXT: vmovaps %ymm1, 64(%r8)
18791 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18792 ; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%r8)
18793 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18794 ; AVX2-FCP-NEXT: vmovaps %ymm1, (%r8)
18795 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18796 ; AVX2-FCP-NEXT: vmovaps %ymm1, 480(%r9)
18797 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18798 ; AVX2-FCP-NEXT: vmovaps %ymm1, 448(%r9)
18799 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18800 ; AVX2-FCP-NEXT: vmovaps %ymm1, 416(%r9)
18801 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18802 ; AVX2-FCP-NEXT: vmovaps %ymm1, 384(%r9)
18803 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18804 ; AVX2-FCP-NEXT: vmovaps %ymm1, 352(%r9)
18805 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18806 ; AVX2-FCP-NEXT: vmovaps %ymm1, 320(%r9)
18807 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18808 ; AVX2-FCP-NEXT: vmovaps %ymm1, 288(%r9)
18809 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18810 ; AVX2-FCP-NEXT: vmovaps %ymm1, 256(%r9)
18811 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18812 ; AVX2-FCP-NEXT: vmovaps %ymm1, 224(%r9)
18813 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18814 ; AVX2-FCP-NEXT: vmovaps %ymm1, 192(%r9)
18815 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18816 ; AVX2-FCP-NEXT: vmovaps %ymm1, 160(%r9)
18817 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18818 ; AVX2-FCP-NEXT: vmovaps %ymm1, 128(%r9)
18819 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18820 ; AVX2-FCP-NEXT: vmovaps %ymm1, 96(%r9)
18821 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18822 ; AVX2-FCP-NEXT: vmovaps %ymm1, 64(%r9)
18823 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18824 ; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%r9)
18825 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18826 ; AVX2-FCP-NEXT: vmovaps %ymm1, (%r9)
18827 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
18828 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18829 ; AVX2-FCP-NEXT: vmovaps %ymm1, 480(%rax)
18830 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18831 ; AVX2-FCP-NEXT: vmovaps %ymm1, 448(%rax)
18832 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18833 ; AVX2-FCP-NEXT: vmovaps %ymm1, 416(%rax)
18834 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18835 ; AVX2-FCP-NEXT: vmovaps %ymm1, 384(%rax)
18836 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18837 ; AVX2-FCP-NEXT: vmovaps %ymm1, 352(%rax)
18838 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18839 ; AVX2-FCP-NEXT: vmovaps %ymm1, 320(%rax)
18840 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18841 ; AVX2-FCP-NEXT: vmovaps %ymm1, 288(%rax)
18842 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18843 ; AVX2-FCP-NEXT: vmovaps %ymm1, 256(%rax)
18844 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18845 ; AVX2-FCP-NEXT: vmovaps %ymm1, 224(%rax)
18846 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18847 ; AVX2-FCP-NEXT: vmovaps %ymm1, 192(%rax)
18848 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18849 ; AVX2-FCP-NEXT: vmovaps %ymm1, 160(%rax)
18850 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18851 ; AVX2-FCP-NEXT: vmovaps %ymm1, 128(%rax)
18852 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18853 ; AVX2-FCP-NEXT: vmovaps %ymm1, 96(%rax)
18854 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18855 ; AVX2-FCP-NEXT: vmovaps %ymm1, 64(%rax)
18856 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18857 ; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%rax)
18858 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18859 ; AVX2-FCP-NEXT: vmovaps %ymm1, (%rax)
18860 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
18861 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18862 ; AVX2-FCP-NEXT: vmovaps %ymm1, 480(%rax)
18863 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18864 ; AVX2-FCP-NEXT: vmovaps %ymm1, 448(%rax)
18865 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18866 ; AVX2-FCP-NEXT: vmovaps %ymm1, 416(%rax)
18867 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18868 ; AVX2-FCP-NEXT: vmovaps %ymm1, 384(%rax)
18869 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18870 ; AVX2-FCP-NEXT: vmovaps %ymm1, 352(%rax)
18871 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18872 ; AVX2-FCP-NEXT: vmovaps %ymm1, 320(%rax)
18873 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18874 ; AVX2-FCP-NEXT: vmovaps %ymm1, 288(%rax)
18875 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18876 ; AVX2-FCP-NEXT: vmovaps %ymm1, 256(%rax)
18877 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18878 ; AVX2-FCP-NEXT: vmovaps %ymm1, 224(%rax)
18879 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18880 ; AVX2-FCP-NEXT: vmovaps %ymm1, 192(%rax)
18881 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18882 ; AVX2-FCP-NEXT: vmovaps %ymm1, 160(%rax)
18883 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18884 ; AVX2-FCP-NEXT: vmovaps %ymm1, 128(%rax)
18885 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18886 ; AVX2-FCP-NEXT: vmovaps %ymm1, 96(%rax)
18887 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18888 ; AVX2-FCP-NEXT: vmovaps %ymm1, 64(%rax)
18889 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18890 ; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%rax)
18891 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
18892 ; AVX2-FCP-NEXT: vmovaps %ymm1, (%rax)
18893 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
18894 ; AVX2-FCP-NEXT: vmovaps %ymm0, 480(%rax)
18895 ; AVX2-FCP-NEXT: vmovaps %ymm5, 448(%rax)
18896 ; AVX2-FCP-NEXT: vmovaps %ymm9, 416(%rax)
18897 ; AVX2-FCP-NEXT: vmovaps %ymm13, 384(%rax)
18898 ; AVX2-FCP-NEXT: vmovaps %ymm15, 352(%rax)
18899 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
18900 ; AVX2-FCP-NEXT: vmovaps %ymm0, 320(%rax)
18901 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
18902 ; AVX2-FCP-NEXT: vmovaps %ymm0, 288(%rax)
18903 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
18904 ; AVX2-FCP-NEXT: vmovaps %ymm0, 256(%rax)
18905 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
18906 ; AVX2-FCP-NEXT: vmovaps %ymm0, 224(%rax)
18907 ; AVX2-FCP-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
18908 ; AVX2-FCP-NEXT: vmovaps %ymm0, 192(%rax)
18909 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
18910 ; AVX2-FCP-NEXT: vmovaps %ymm0, 160(%rax)
18911 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
18912 ; AVX2-FCP-NEXT: vmovaps %ymm0, 128(%rax)
18913 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
18914 ; AVX2-FCP-NEXT: vmovaps %ymm0, 96(%rax)
18915 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
18916 ; AVX2-FCP-NEXT: vmovaps %ymm0, 64(%rax)
18917 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
18918 ; AVX2-FCP-NEXT: vmovaps %ymm0, 32(%rax)
18919 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
18920 ; AVX2-FCP-NEXT: vmovaps %ymm0, (%rax)
18921 ; AVX2-FCP-NEXT: addq $5064, %rsp # imm = 0x13C8
18922 ; AVX2-FCP-NEXT: vzeroupper
18923 ; AVX2-FCP-NEXT: retq
18925 ; AVX512-LABEL: load_i64_stride8_vf64:
18927 ; AVX512-NEXT: subq $6664, %rsp # imm = 0x1A08
18928 ; AVX512-NEXT: vmovdqa64 3392(%rdi), %zmm3
18929 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18930 ; AVX512-NEXT: vmovdqa64 3328(%rdi), %zmm16
18931 ; AVX512-NEXT: vmovdqa64 3520(%rdi), %zmm8
18932 ; AVX512-NEXT: vmovdqa64 3456(%rdi), %zmm28
18933 ; AVX512-NEXT: vmovdqa64 1856(%rdi), %zmm10
18934 ; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18935 ; AVX512-NEXT: vmovdqa64 1984(%rdi), %zmm11
18936 ; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18937 ; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm4
18938 ; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18939 ; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm5
18940 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18941 ; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm6
18942 ; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm9
18943 ; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18944 ; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm12
18945 ; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18946 ; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm15
18947 ; AVX512-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18948 ; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm13
18949 ; AVX512-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18950 ; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm14
18951 ; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18952 ; AVX512-NEXT: movb $-64, %al
18953 ; AVX512-NEXT: kmovw %eax, %k1
18954 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
18955 ; AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
18956 ; AVX512-NEXT: vmovdqa64 %zmm28, %zmm0
18957 ; AVX512-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18958 ; AVX512-NEXT: vpermt2q %zmm8, %zmm2, %zmm0
18959 ; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18960 ; AVX512-NEXT: vmovdqa64 %zmm16, %zmm1
18961 ; AVX512-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
18962 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
18963 ; AVX512-NEXT: vmovdqa 3264(%rdi), %ymm3
18964 ; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18965 ; AVX512-NEXT: vmovdqa 3200(%rdi), %ymm0
18966 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18967 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
18968 ; AVX512-NEXT: vmovdqa 3136(%rdi), %ymm3
18969 ; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18970 ; AVX512-NEXT: vmovdqa 3072(%rdi), %ymm7
18971 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm7[0],ymm3[0],ymm7[2],ymm3[2]
18972 ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
18973 ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
18974 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18975 ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm0
18976 ; AVX512-NEXT: vpermt2q %zmm6, %zmm2, %zmm0
18977 ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm9
18978 ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18979 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm1
18980 ; AVX512-NEXT: vpermt2q %zmm4, %zmm2, %zmm1
18981 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
18982 ; AVX512-NEXT: vmovdqa 704(%rdi), %ymm0
18983 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18984 ; AVX512-NEXT: vmovdqa64 640(%rdi), %ymm20
18985 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm20[0],ymm0[0],ymm20[2],ymm0[2]
18986 ; AVX512-NEXT: vmovdqa64 576(%rdi), %ymm22
18987 ; AVX512-NEXT: vmovdqa64 512(%rdi), %ymm19
18988 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm22[0],ymm19[2],ymm22[2]
18989 ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
18990 ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
18991 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18992 ; AVX512-NEXT: vmovdqa64 %zmm14, %zmm0
18993 ; AVX512-NEXT: vpermt2q %zmm13, %zmm2, %zmm0
18994 ; AVX512-NEXT: vmovdqa64 %zmm15, %zmm1
18995 ; AVX512-NEXT: vpermt2q %zmm12, %zmm2, %zmm1
18996 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
18997 ; AVX512-NEXT: vmovdqa 192(%rdi), %ymm14
18998 ; AVX512-NEXT: vmovdqa 128(%rdi), %ymm15
18999 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm14[0],ymm15[2],ymm14[2]
19000 ; AVX512-NEXT: vmovdqa64 64(%rdi), %ymm21
19001 ; AVX512-NEXT: vmovdqa (%rdi), %ymm13
19002 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm13[0],ymm21[0],ymm13[2],ymm21[2]
19003 ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
19004 ; AVX512-NEXT: vmovdqa64 1920(%rdi), %zmm3
19005 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19006 ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
19007 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19008 ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm0
19009 ; AVX512-NEXT: vpermt2q %zmm11, %zmm2, %zmm0
19010 ; AVX512-NEXT: vmovdqa64 1792(%rdi), %zmm1
19011 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19012 ; AVX512-NEXT: vpermt2q %zmm10, %zmm2, %zmm1
19013 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
19014 ; AVX512-NEXT: vmovdqa 1728(%rdi), %ymm3
19015 ; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
19016 ; AVX512-NEXT: vmovdqa 1664(%rdi), %ymm0
19017 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
19018 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
19019 ; AVX512-NEXT: vmovdqa 1600(%rdi), %ymm4
19020 ; AVX512-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
19021 ; AVX512-NEXT: vmovdqa 1536(%rdi), %ymm3
19022 ; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
19023 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2]
19024 ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
19025 ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
19026 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19027 ; AVX512-NEXT: vmovdqa64 1472(%rdi), %zmm1
19028 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19029 ; AVX512-NEXT: vmovdqa64 1408(%rdi), %zmm0
19030 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19031 ; AVX512-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
19032 ; AVX512-NEXT: vmovdqa64 1344(%rdi), %zmm3
19033 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19034 ; AVX512-NEXT: vmovdqa64 1280(%rdi), %zmm1
19035 ; AVX512-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
19036 ; AVX512-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
19037 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
19038 ; AVX512-NEXT: vmovdqa 1216(%rdi), %ymm3
19039 ; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
19040 ; AVX512-NEXT: vmovdqa 1152(%rdi), %ymm0
19041 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
19042 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
19043 ; AVX512-NEXT: vmovdqa 1088(%rdi), %ymm4
19044 ; AVX512-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
19045 ; AVX512-NEXT: vmovdqa 1024(%rdi), %ymm3
19046 ; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
19047 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2]
19048 ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
19049 ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
19050 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19051 ; AVX512-NEXT: vmovdqa64 3008(%rdi), %zmm1
19052 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19053 ; AVX512-NEXT: vmovdqa64 2944(%rdi), %zmm29
19054 ; AVX512-NEXT: vmovdqa64 %zmm29, %zmm0
19055 ; AVX512-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19056 ; AVX512-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
19057 ; AVX512-NEXT: vmovdqa64 2880(%rdi), %zmm24
19058 ; AVX512-NEXT: vmovdqa64 2816(%rdi), %zmm25
19059 ; AVX512-NEXT: vmovdqa64 %zmm25, %zmm1
19060 ; AVX512-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19061 ; AVX512-NEXT: vpermt2q %zmm24, %zmm2, %zmm1
19062 ; AVX512-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19063 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
19064 ; AVX512-NEXT: vmovdqa64 2752(%rdi), %ymm27
19065 ; AVX512-NEXT: vmovdqa64 2688(%rdi), %ymm26
19066 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm26[0],ymm27[0],ymm26[2],ymm27[2]
19067 ; AVX512-NEXT: vmovdqa64 2624(%rdi), %ymm30
19068 ; AVX512-NEXT: vmovdqa64 2560(%rdi), %ymm18
19069 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm18[0],ymm30[0],ymm18[2],ymm30[2]
19070 ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
19071 ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
19072 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19073 ; AVX512-NEXT: vmovdqa64 2496(%rdi), %zmm1
19074 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19075 ; AVX512-NEXT: vmovdqa64 2432(%rdi), %zmm31
19076 ; AVX512-NEXT: vmovdqa64 %zmm31, %zmm0
19077 ; AVX512-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19078 ; AVX512-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
19079 ; AVX512-NEXT: vmovdqa64 2368(%rdi), %zmm3
19080 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19081 ; AVX512-NEXT: vmovdqa64 2304(%rdi), %zmm1
19082 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19083 ; AVX512-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
19084 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
19085 ; AVX512-NEXT: vmovdqa 2240(%rdi), %ymm12
19086 ; AVX512-NEXT: vmovdqa 2176(%rdi), %ymm11
19087 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm12[0],ymm11[2],ymm12[2]
19088 ; AVX512-NEXT: vmovdqa 2112(%rdi), %ymm10
19089 ; AVX512-NEXT: vmovdqa 2048(%rdi), %ymm3
19090 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm3[0],ymm10[0],ymm3[2],ymm10[2]
19091 ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm5[2,3],ymm0[2,3]
19092 ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
19093 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19094 ; AVX512-NEXT: vmovdqa64 4032(%rdi), %zmm1
19095 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19096 ; AVX512-NEXT: vmovdqa64 3968(%rdi), %zmm0
19097 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19098 ; AVX512-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
19099 ; AVX512-NEXT: vmovdqa64 3904(%rdi), %zmm1
19100 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19101 ; AVX512-NEXT: vmovdqa64 3840(%rdi), %zmm4
19102 ; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19103 ; AVX512-NEXT: vpermi2q %zmm1, %zmm4, %zmm2
19104 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
19105 ; AVX512-NEXT: vmovdqa64 3776(%rdi), %ymm17
19106 ; AVX512-NEXT: vmovdqa64 3712(%rdi), %ymm23
19107 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm23[0],ymm17[0],ymm23[2],ymm17[2]
19108 ; AVX512-NEXT: vmovdqa 3648(%rdi), %ymm1
19109 ; AVX512-NEXT: vmovdqa 3584(%rdi), %ymm0
19110 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
19111 ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm6[2,3],ymm5[2,3]
19112 ; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2
19113 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19114 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11]
19115 ; AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
19116 ; AVX512-NEXT: vpermt2q %zmm8, %zmm2, %zmm28
19117 ; AVX512-NEXT: vmovdqa64 %zmm16, %zmm8
19118 ; AVX512-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19119 ; AVX512-NEXT: vmovdqa64 %zmm16, %zmm6
19120 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
19121 ; AVX512-NEXT: vpermt2q %zmm16, %zmm2, %zmm6
19122 ; AVX512-NEXT: vmovdqa64 %zmm28, %zmm6 {%k1}
19123 ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
19124 ; AVX512-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload
19125 ; AVX512-NEXT: # ymm5 = ymm4[1],mem[1],ymm4[3],mem[3]
19126 ; AVX512-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
19127 ; AVX512-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3]
19128 ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3]
19129 ; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5
19130 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19131 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
19132 ; AVX512-NEXT: vmovdqa64 %zmm28, %zmm5
19133 ; AVX512-NEXT: vpermt2q %zmm9, %zmm2, %zmm5
19134 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
19135 ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm6
19136 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
19137 ; AVX512-NEXT: vpermt2q %zmm4, %zmm2, %zmm6
19138 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1}
19139 ; AVX512-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm20, %ymm5 # 32-byte Folded Reload
19140 ; AVX512-NEXT: # ymm5 = ymm20[1],mem[1],ymm20[3],mem[3]
19141 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm19[1],ymm22[1],ymm19[3],ymm22[3]
19142 ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3]
19143 ; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5
19144 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19145 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
19146 ; AVX512-NEXT: vmovdqa64 %zmm20, %zmm5
19147 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
19148 ; AVX512-NEXT: vpermt2q %zmm19, %zmm2, %zmm5
19149 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19150 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
19151 ; AVX512-NEXT: vpermt2q %zmm22, %zmm2, %zmm6
19152 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1}
19153 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm15[1],ymm14[1],ymm15[3],ymm14[3]
19154 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm13[1],ymm21[1],ymm13[3],ymm21[3]
19155 ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3]
19156 ; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5
19157 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19158 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
19159 ; AVX512-NEXT: vmovdqa64 %zmm14, %zmm5
19160 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
19161 ; AVX512-NEXT: vpermt2q %zmm15, %zmm2, %zmm5
19162 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
19163 ; AVX512-NEXT: vmovdqa64 %zmm21, %zmm6
19164 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
19165 ; AVX512-NEXT: vpermt2q %zmm13, %zmm2, %zmm6
19166 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1}
19167 ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
19168 ; AVX512-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
19169 ; AVX512-NEXT: # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3]
19170 ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
19171 ; AVX512-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
19172 ; AVX512-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3]
19173 ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3]
19174 ; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5
19175 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19176 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
19177 ; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 64-byte Folded Reload
19178 ; AVX512-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload
19179 ; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 64-byte Folded Reload
19180 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1}
19181 ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
19182 ; AVX512-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
19183 ; AVX512-NEXT: # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3]
19184 ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
19185 ; AVX512-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
19186 ; AVX512-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3]
19187 ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3]
19188 ; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5
19189 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19190 ; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm29 # 64-byte Folded Reload
19191 ; AVX512-NEXT: vpermt2q %zmm24, %zmm2, %zmm25
19192 ; AVX512-NEXT: vmovdqa64 %zmm29, %zmm25 {%k1}
19193 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm26[1],ymm27[1],ymm26[3],ymm27[3]
19194 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm18[1],ymm30[1],ymm18[3],ymm30[3]
19195 ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3]
19196 ; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm25, %zmm5
19197 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19198 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
19199 ; AVX512-NEXT: vpermt2q %zmm24, %zmm2, %zmm31
19200 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
19201 ; AVX512-NEXT: vmovdqa64 %zmm26, %zmm6
19202 ; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 64-byte Folded Reload
19203 ; AVX512-NEXT: vmovdqa64 %zmm31, %zmm6 {%k1}
19204 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm11[1],ymm12[1],ymm11[3],ymm12[3]
19205 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],ymm10[1],ymm3[3],ymm10[3]
19206 ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm5[2,3]
19207 ; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3
19208 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19209 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
19210 ; AVX512-NEXT: vmovdqa64 %zmm29, %zmm3
19211 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
19212 ; AVX512-NEXT: vpermt2q %zmm18, %zmm2, %zmm3
19213 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
19214 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
19215 ; AVX512-NEXT: vpermi2q %zmm11, %zmm12, %zmm2
19216 ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1}
19217 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm23[1],ymm17[1],ymm23[3],ymm17[3]
19218 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
19219 ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
19220 ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
19221 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19222 ; AVX512-NEXT: vmovdqa64 3264(%rdi), %zmm3
19223 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19224 ; AVX512-NEXT: vmovdqa64 3200(%rdi), %zmm10
19225 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12]
19226 ; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
19227 ; AVX512-NEXT: vmovdqa64 %zmm10, %zmm1
19228 ; AVX512-NEXT: vpermt2q %zmm3, %zmm0, %zmm1
19229 ; AVX512-NEXT: vmovdqa64 3136(%rdi), %zmm23
19230 ; AVX512-NEXT: vmovdqa64 3072(%rdi), %zmm2
19231 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19232 ; AVX512-NEXT: vpermt2q %zmm23, %zmm0, %zmm2
19233 ; AVX512-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19234 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
19235 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
19236 ; AVX512-NEXT: vmovdqa64 %zmm17, %zmm2
19237 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
19238 ; AVX512-NEXT: vpermt2q %zmm25, %zmm0, %zmm2
19239 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm8[0],zmm16[0],zmm8[2],zmm16[2],zmm8[4],zmm16[4],zmm8[6],zmm16[6]
19240 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
19241 ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
19242 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19243 ; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm2
19244 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19245 ; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm1
19246 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19247 ; AVX512-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
19248 ; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm3
19249 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19250 ; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm2
19251 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19252 ; AVX512-NEXT: vpermt2q %zmm3, %zmm0, %zmm2
19253 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
19254 ; AVX512-NEXT: vmovdqa64 %zmm28, %zmm2
19255 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
19256 ; AVX512-NEXT: vpermt2q %zmm30, %zmm0, %zmm2
19257 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm9[0],zmm4[0],zmm9[2],zmm4[2],zmm9[4],zmm4[4],zmm9[6],zmm4[6]
19258 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
19259 ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
19260 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19261 ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm2
19262 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19263 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm1
19264 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19265 ; AVX512-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
19266 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm3
19267 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19268 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm2
19269 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19270 ; AVX512-NEXT: vpermt2q %zmm3, %zmm0, %zmm2
19271 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
19272 ; AVX512-NEXT: vmovdqa64 %zmm20, %zmm2
19273 ; AVX512-NEXT: vpermt2q %zmm19, %zmm0, %zmm2
19274 ; AVX512-NEXT: vmovdqa64 %zmm19, %zmm16
19275 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
19276 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm3[0],zmm22[0],zmm3[2],zmm22[2],zmm3[4],zmm22[4],zmm3[6],zmm22[6]
19277 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
19278 ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
19279 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19280 ; AVX512-NEXT: vmovdqa64 1728(%rdi), %zmm2
19281 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19282 ; AVX512-NEXT: vmovdqa64 1664(%rdi), %zmm1
19283 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19284 ; AVX512-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
19285 ; AVX512-NEXT: vmovdqa64 1600(%rdi), %zmm3
19286 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19287 ; AVX512-NEXT: vmovdqa64 1536(%rdi), %zmm2
19288 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19289 ; AVX512-NEXT: vpermt2q %zmm3, %zmm0, %zmm2
19290 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
19291 ; AVX512-NEXT: vmovdqa64 %zmm14, %zmm2
19292 ; AVX512-NEXT: vpermt2q %zmm15, %zmm0, %zmm2
19293 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm13[0],zmm21[2],zmm13[2],zmm21[4],zmm13[4],zmm21[6],zmm13[6]
19294 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
19295 ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
19296 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19297 ; AVX512-NEXT: vmovdqa64 1216(%rdi), %zmm2
19298 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19299 ; AVX512-NEXT: vmovdqa64 1152(%rdi), %zmm1
19300 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19301 ; AVX512-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
19302 ; AVX512-NEXT: vmovdqa64 1088(%rdi), %zmm3
19303 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19304 ; AVX512-NEXT: vmovdqa64 1024(%rdi), %zmm2
19305 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19306 ; AVX512-NEXT: vpermt2q %zmm3, %zmm0, %zmm2
19307 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
19308 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
19309 ; AVX512-NEXT: vmovdqa64 %zmm31, %zmm2
19310 ; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
19311 ; AVX512-NEXT: vmovdqu64 (%rsp), %zmm27 # 64-byte Reload
19312 ; AVX512-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm3 # 64-byte Folded Reload
19313 ; AVX512-NEXT: # zmm3 = zmm27[0],mem[0],zmm27[2],mem[2],zmm27[4],mem[4],zmm27[6],mem[6]
19314 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
19315 ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
19316 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19317 ; AVX512-NEXT: vmovdqa64 2752(%rdi), %zmm3
19318 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19319 ; AVX512-NEXT: vmovdqa64 2688(%rdi), %zmm8
19320 ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm1
19321 ; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19322 ; AVX512-NEXT: vpermt2q %zmm3, %zmm0, %zmm1
19323 ; AVX512-NEXT: vmovdqa64 2624(%rdi), %zmm3
19324 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19325 ; AVX512-NEXT: vmovdqa64 2560(%rdi), %zmm2
19326 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19327 ; AVX512-NEXT: vpermt2q %zmm3, %zmm0, %zmm2
19328 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
19329 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
19330 ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm2
19331 ; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
19332 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
19333 ; AVX512-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm3 # 64-byte Folded Reload
19334 ; AVX512-NEXT: # zmm3 = zmm7[0],mem[0],zmm7[2],mem[2],zmm7[4],mem[4],zmm7[6],mem[6]
19335 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
19336 ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
19337 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19338 ; AVX512-NEXT: vmovdqa64 2240(%rdi), %zmm2
19339 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19340 ; AVX512-NEXT: vmovdqa64 2176(%rdi), %zmm1
19341 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19342 ; AVX512-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
19343 ; AVX512-NEXT: vmovdqa64 2112(%rdi), %zmm3
19344 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19345 ; AVX512-NEXT: vmovdqa64 2048(%rdi), %zmm2
19346 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19347 ; AVX512-NEXT: vpermt2q %zmm3, %zmm0, %zmm2
19348 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
19349 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
19350 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm2
19351 ; AVX512-NEXT: vpermt2q %zmm24, %zmm0, %zmm2
19352 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
19353 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm26[0],zmm4[0],zmm26[2],zmm4[2],zmm26[4],zmm4[4],zmm26[6],zmm4[6]
19354 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
19355 ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
19356 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19357 ; AVX512-NEXT: vmovdqa64 3776(%rdi), %zmm2
19358 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19359 ; AVX512-NEXT: vmovdqa64 3712(%rdi), %zmm1
19360 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19361 ; AVX512-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
19362 ; AVX512-NEXT: vmovdqa64 3648(%rdi), %zmm3
19363 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19364 ; AVX512-NEXT: vmovdqa64 3584(%rdi), %zmm2
19365 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19366 ; AVX512-NEXT: vpermt2q %zmm3, %zmm0, %zmm2
19367 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
19368 ; AVX512-NEXT: vpermi2q %zmm18, %zmm29, %zmm0
19369 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm12[0],zmm11[0],zmm12[2],zmm11[2],zmm12[4],zmm11[4],zmm12[6],zmm11[6]
19370 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
19371 ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0
19372 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19373 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,5,13,5,13,5,13]
19374 ; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
19375 ; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19376 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
19377 ; AVX512-NEXT: vpermt2q %zmm26, %zmm0, %zmm10
19378 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
19379 ; AVX512-NEXT: vpermt2q %zmm23, %zmm0, %zmm2
19380 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm10[4,5,6,7]
19381 ; AVX512-NEXT: vmovdqa64 %zmm17, %zmm2
19382 ; AVX512-NEXT: vpermt2q %zmm25, %zmm0, %zmm2
19383 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
19384 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
19385 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm19[1],zmm25[1],zmm19[3],zmm25[3],zmm19[5],zmm25[5],zmm19[7],zmm25[7]
19386 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
19387 ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
19388 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19389 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
19390 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
19391 ; AVX512-NEXT: vpermt2q %zmm22, %zmm0, %zmm1
19392 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
19393 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
19394 ; AVX512-NEXT: vpermt2q %zmm24, %zmm0, %zmm2
19395 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
19396 ; AVX512-NEXT: vmovdqa64 %zmm28, %zmm29
19397 ; AVX512-NEXT: vmovdqa64 %zmm28, %zmm2
19398 ; AVX512-NEXT: vpermt2q %zmm30, %zmm0, %zmm2
19399 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
19400 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
19401 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm15[1],zmm28[1],zmm15[3],zmm28[3],zmm15[5],zmm28[5],zmm15[7],zmm28[7]
19402 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
19403 ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
19404 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19405 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
19406 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
19407 ; AVX512-NEXT: vpermt2q %zmm20, %zmm0, %zmm1
19408 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
19409 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
19410 ; AVX512-NEXT: vpermt2q %zmm21, %zmm0, %zmm2
19411 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
19412 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
19413 ; AVX512-NEXT: vpermt2q %zmm16, %zmm0, %zmm2
19414 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
19415 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
19416 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm12[1],zmm18[1],zmm12[3],zmm18[3],zmm12[5],zmm18[5],zmm12[7],zmm18[7]
19417 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
19418 ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
19419 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19420 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
19421 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
19422 ; AVX512-NEXT: vpermt2q %zmm17, %zmm0, %zmm1
19423 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
19424 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
19425 ; AVX512-NEXT: vpermt2q %zmm16, %zmm0, %zmm2
19426 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
19427 ; AVX512-NEXT: vmovdqa64 %zmm14, %zmm2
19428 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
19429 ; AVX512-NEXT: vpermt2q %zmm23, %zmm0, %zmm2
19430 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
19431 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm3[1],zmm13[1],zmm3[3],zmm13[3],zmm3[5],zmm13[5],zmm3[7],zmm13[7]
19432 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
19433 ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
19434 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19435 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
19436 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
19437 ; AVX512-NEXT: vpermt2q %zmm13, %zmm0, %zmm1
19438 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
19439 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
19440 ; AVX512-NEXT: vpermt2q %zmm14, %zmm0, %zmm2
19441 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
19442 ; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm31 # 64-byte Folded Reload
19443 ; AVX512-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm3 # 64-byte Folded Reload
19444 ; AVX512-NEXT: # zmm3 = zmm27[1],mem[1],zmm27[3],mem[3],zmm27[5],mem[5],zmm27[7],mem[7]
19445 ; AVX512-NEXT: vmovdqa64 %zmm31, %zmm3 {%k1}
19446 ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
19447 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19448 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
19449 ; AVX512-NEXT: vpermt2q %zmm31, %zmm0, %zmm8
19450 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
19451 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
19452 ; AVX512-NEXT: vpermt2q %zmm11, %zmm0, %zmm2
19453 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm8[4,5,6,7]
19454 ; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 64-byte Folded Reload
19455 ; AVX512-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm3 # 64-byte Folded Reload
19456 ; AVX512-NEXT: # zmm3 = zmm7[1],mem[1],zmm7[3],mem[3],zmm7[5],mem[5],zmm7[7],mem[7]
19457 ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1}
19458 ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
19459 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19460 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
19461 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
19462 ; AVX512-NEXT: vpermt2q %zmm27, %zmm0, %zmm1
19463 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
19464 ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm2
19465 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
19466 ; AVX512-NEXT: vpermt2q %zmm8, %zmm0, %zmm2
19467 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
19468 ; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload
19469 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
19470 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm3[1],zmm4[1],zmm3[3],zmm4[3],zmm3[5],zmm4[5],zmm3[7],zmm4[7]
19471 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1}
19472 ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
19473 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19474 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
19475 ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm1
19476 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
19477 ; AVX512-NEXT: vpermt2q %zmm5, %zmm0, %zmm1
19478 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
19479 ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm2
19480 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19481 ; AVX512-NEXT: vpermt2q %zmm6, %zmm0, %zmm2
19482 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
19483 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
19484 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
19485 ; AVX512-NEXT: vpermi2q %zmm10, %zmm9, %zmm0
19486 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
19487 ; AVX512-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload
19488 ; AVX512-NEXT: # zmm2 = zmm2[1],mem[1],zmm2[3],mem[3],zmm2[5],mem[5],zmm2[7],mem[7]
19489 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
19490 ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0
19491 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19492 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14]
19493 ; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
19494 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
19495 ; AVX512-NEXT: vpermt2q %zmm26, %zmm0, %zmm1
19496 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
19497 ; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
19498 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
19499 ; AVX512-NEXT: vmovdqa64 %zmm19, %zmm2
19500 ; AVX512-NEXT: vpermt2q %zmm25, %zmm0, %zmm2
19501 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
19502 ; AVX512-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm2 {%k1} # 64-byte Folded Reload
19503 ; AVX512-NEXT: # zmm2 {%k1} = zmm25[0],mem[0],zmm25[2],mem[2],zmm25[4],mem[4],zmm25[6],mem[6]
19504 ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
19505 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19506 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
19507 ; AVX512-NEXT: vpermt2q %zmm22, %zmm0, %zmm1
19508 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
19509 ; AVX512-NEXT: vpermt2q %zmm24, %zmm0, %zmm2
19510 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
19511 ; AVX512-NEXT: vpermt2q %zmm28, %zmm0, %zmm15
19512 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm15 {%k1} = zmm29[0],zmm30[0],zmm29[2],zmm30[2],zmm29[4],zmm30[4],zmm29[6],zmm30[6]
19513 ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm1
19514 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19515 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
19516 ; AVX512-NEXT: vpermt2q %zmm20, %zmm0, %zmm1
19517 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
19518 ; AVX512-NEXT: vpermt2q %zmm21, %zmm0, %zmm2
19519 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
19520 ; AVX512-NEXT: vmovdqa64 %zmm12, %zmm2
19521 ; AVX512-NEXT: vpermt2q %zmm18, %zmm0, %zmm2
19522 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
19523 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
19524 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm30[0],zmm26[0],zmm30[2],zmm26[2],zmm30[4],zmm26[4],zmm30[6],zmm26[6]
19525 ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
19526 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19527 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
19528 ; AVX512-NEXT: vpermt2q %zmm17, %zmm0, %zmm1
19529 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
19530 ; AVX512-NEXT: vpermt2q %zmm16, %zmm0, %zmm2
19531 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
19532 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
19533 ; AVX512-NEXT: vmovdqa64 %zmm16, %zmm2
19534 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
19535 ; AVX512-NEXT: vpermt2q %zmm29, %zmm0, %zmm2
19536 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
19537 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm24[0],zmm23[0],zmm24[2],zmm23[2],zmm24[4],zmm23[4],zmm24[6],zmm23[6]
19538 ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
19539 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19540 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
19541 ; AVX512-NEXT: vpermt2q %zmm13, %zmm0, %zmm1
19542 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
19543 ; AVX512-NEXT: vpermt2q %zmm14, %zmm0, %zmm2
19544 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
19545 ; AVX512-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload
19546 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
19547 ; AVX512-NEXT: vpermt2q %zmm21, %zmm0, %zmm2
19548 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
19549 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
19550 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6]
19551 ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
19552 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19553 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
19554 ; AVX512-NEXT: vpermt2q %zmm31, %zmm0, %zmm1
19555 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
19556 ; AVX512-NEXT: vpermt2q %zmm11, %zmm0, %zmm2
19557 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
19558 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
19559 ; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
19560 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
19561 ; AVX512-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm2 {%k1} # 64-byte Folded Reload
19562 ; AVX512-NEXT: # zmm2 {%k1} = zmm11[0],mem[0],zmm11[2],mem[2],zmm11[4],mem[4],zmm11[6],mem[6]
19563 ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
19564 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19565 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
19566 ; AVX512-NEXT: vpermt2q %zmm27, %zmm0, %zmm1
19567 ; AVX512-NEXT: vpermt2q %zmm8, %zmm0, %zmm7
19568 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7]
19569 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
19570 ; AVX512-NEXT: vmovdqa64 %zmm22, %zmm2
19571 ; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
19572 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
19573 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
19574 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm7[0],zmm28[0],zmm7[2],zmm28[2],zmm7[4],zmm28[4],zmm7[6],zmm28[6]
19575 ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
19576 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19577 ; AVX512-NEXT: vpermt2q %zmm5, %zmm0, %zmm3
19578 ; AVX512-NEXT: vpermt2q %zmm6, %zmm0, %zmm4
19579 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm3[4,5,6,7]
19580 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
19581 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
19582 ; AVX512-NEXT: vpermi2q %zmm25, %zmm15, %zmm0
19583 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm9[0],zmm10[0],zmm9[2],zmm10[2],zmm9[4],zmm10[4],zmm9[6],zmm10[6]
19584 ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
19585 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19586 ; AVX512-NEXT: vmovdqa64 %zmm19, %zmm1
19587 ; AVX512-NEXT: vmovdqa64 %zmm19, %zmm0
19588 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [7,15,7,15,7,15,7,15]
19589 ; AVX512-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
19590 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
19591 ; AVX512-NEXT: vpermt2q %zmm3, %zmm5, %zmm19
19592 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
19593 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
19594 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm19 {%k1} = zmm17[1],zmm4[1],zmm17[3],zmm4[3],zmm17[5],zmm4[5],zmm17[7],zmm4[7]
19595 ; AVX512-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19596 ; AVX512-NEXT: vmovdqa64 %zmm17, %zmm6
19597 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [0,8,0,8,0,8,0,8]
19598 ; AVX512-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
19599 ; AVX512-NEXT: vpermt2q %zmm4, %zmm12, %zmm17
19600 ; AVX512-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19601 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [1,9,1,9,1,9,1,9]
19602 ; AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
19603 ; AVX512-NEXT: vpermt2q %zmm4, %zmm2, %zmm6
19604 ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19605 ; AVX512-NEXT: vpermt2q %zmm3, %zmm12, %zmm1
19606 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19607 ; AVX512-NEXT: vpermt2q %zmm3, %zmm2, %zmm0
19608 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19609 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
19610 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm20
19611 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0
19612 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
19613 ; AVX512-NEXT: vpermt2q %zmm4, %zmm5, %zmm1
19614 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
19615 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
19616 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm13[1],zmm11[1],zmm13[3],zmm11[3],zmm13[5],zmm11[5],zmm13[7],zmm11[7]
19617 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19618 ; AVX512-NEXT: vmovdqa64 %zmm13, %zmm1
19619 ; AVX512-NEXT: vpermt2q %zmm11, %zmm12, %zmm13
19620 ; AVX512-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19621 ; AVX512-NEXT: vpermt2q %zmm11, %zmm2, %zmm1
19622 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19623 ; AVX512-NEXT: vpermt2q %zmm4, %zmm12, %zmm20
19624 ; AVX512-NEXT: vpermt2q %zmm4, %zmm2, %zmm0
19625 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19626 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
19627 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm17
19628 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0
19629 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
19630 ; AVX512-NEXT: vpermt2q %zmm9, %zmm5, %zmm1
19631 ; AVX512-NEXT: vmovdqa64 %zmm26, %zmm10
19632 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm30[1],zmm26[1],zmm30[3],zmm26[3],zmm30[5],zmm26[5],zmm30[7],zmm26[7]
19633 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19634 ; AVX512-NEXT: vmovdqa64 %zmm30, %zmm1
19635 ; AVX512-NEXT: vmovdqa64 %zmm30, %zmm26
19636 ; AVX512-NEXT: vpermt2q %zmm10, %zmm12, %zmm26
19637 ; AVX512-NEXT: vpermt2q %zmm10, %zmm2, %zmm1
19638 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19639 ; AVX512-NEXT: vpermt2q %zmm9, %zmm12, %zmm17
19640 ; AVX512-NEXT: vpermt2q %zmm9, %zmm2, %zmm0
19641 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19642 ; AVX512-NEXT: vmovdqa64 %zmm16, %zmm19
19643 ; AVX512-NEXT: vmovdqa64 %zmm16, %zmm31
19644 ; AVX512-NEXT: vmovdqa64 %zmm29, %zmm3
19645 ; AVX512-NEXT: vpermt2q %zmm29, %zmm5, %zmm16
19646 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19647 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm16 {%k1} = zmm24[1],zmm6[1],zmm24[3],zmm6[3],zmm24[5],zmm6[5],zmm24[7],zmm6[7]
19648 ; AVX512-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19649 ; AVX512-NEXT: vmovdqa64 %zmm24, %zmm0
19650 ; AVX512-NEXT: vmovdqa64 %zmm24, %zmm29
19651 ; AVX512-NEXT: vpermt2q %zmm6, %zmm12, %zmm29
19652 ; AVX512-NEXT: vpermt2q %zmm6, %zmm2, %zmm0
19653 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19654 ; AVX512-NEXT: vpermt2q %zmm3, %zmm12, %zmm19
19655 ; AVX512-NEXT: vpermt2q %zmm3, %zmm2, %zmm31
19656 ; AVX512-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload
19657 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm16
19658 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm27
19659 ; AVX512-NEXT: vpermt2q %zmm21, %zmm5, %zmm1
19660 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm23[1],zmm18[1],zmm23[3],zmm18[3],zmm23[5],zmm18[5],zmm23[7],zmm18[7]
19661 ; AVX512-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
19662 ; AVX512-NEXT: vmovdqa64 %zmm23, %zmm1
19663 ; AVX512-NEXT: vmovdqa64 %zmm23, %zmm24
19664 ; AVX512-NEXT: vpermt2q %zmm18, %zmm12, %zmm24
19665 ; AVX512-NEXT: vpermt2q %zmm18, %zmm2, %zmm1
19666 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19667 ; AVX512-NEXT: vpermt2q %zmm21, %zmm12, %zmm16
19668 ; AVX512-NEXT: vpermt2q %zmm21, %zmm2, %zmm27
19669 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
19670 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm18
19671 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm30
19672 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
19673 ; AVX512-NEXT: vpermt2q %zmm8, %zmm5, %zmm1
19674 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19675 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
19676 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm10[1],zmm0[1],zmm10[3],zmm0[3],zmm10[5],zmm0[5],zmm10[7],zmm0[7]
19677 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19678 ; AVX512-NEXT: vmovdqa64 %zmm10, %zmm1
19679 ; AVX512-NEXT: vpermt2q %zmm0, %zmm12, %zmm10
19680 ; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19681 ; AVX512-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
19682 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19683 ; AVX512-NEXT: vpermt2q %zmm8, %zmm12, %zmm18
19684 ; AVX512-NEXT: vpermt2q %zmm8, %zmm2, %zmm30
19685 ; AVX512-NEXT: vmovdqa64 %zmm22, %zmm6
19686 ; AVX512-NEXT: vmovdqa64 %zmm22, %zmm23
19687 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
19688 ; AVX512-NEXT: vpermt2q %zmm8, %zmm5, %zmm6
19689 ; AVX512-NEXT: vmovdqa64 %zmm28, %zmm0
19690 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
19691 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm21[1],zmm28[1],zmm21[3],zmm28[3],zmm21[5],zmm28[5],zmm21[7],zmm28[7]
19692 ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm28
19693 ; AVX512-NEXT: vmovdqa64 %zmm21, %zmm1
19694 ; AVX512-NEXT: vpermt2q %zmm0, %zmm12, %zmm21
19695 ; AVX512-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
19696 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19697 ; AVX512-NEXT: vpermt2q %zmm8, %zmm12, %zmm23
19698 ; AVX512-NEXT: vpermt2q %zmm8, %zmm2, %zmm22
19699 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
19700 ; AVX512-NEXT: vmovdqa64 %zmm14, %zmm0
19701 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
19702 ; AVX512-NEXT: vpermt2q %zmm8, %zmm12, %zmm0
19703 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19704 ; AVX512-NEXT: vpermi2q %zmm25, %zmm15, %zmm12
19705 ; AVX512-NEXT: vmovdqa64 %zmm14, %zmm3
19706 ; AVX512-NEXT: vpermt2q %zmm8, %zmm2, %zmm3
19707 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19708 ; AVX512-NEXT: vpermi2q %zmm25, %zmm15, %zmm2
19709 ; AVX512-NEXT: vpermt2q %zmm25, %zmm5, %zmm15
19710 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm14[1],zmm8[1],zmm14[3],zmm8[3],zmm14[5],zmm8[5],zmm14[7],zmm8[7]
19711 ; AVX512-NEXT: vmovdqa64 %zmm15, %zmm25
19712 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
19713 ; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm7 # 64-byte Folded Reload
19714 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19715 ; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm0 # 64-byte Folded Reload
19716 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
19717 ; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm8 # 64-byte Folded Reload
19718 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
19719 ; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm1 # 64-byte Folded Reload
19720 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
19721 ; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm13 # 64-byte Folded Reload
19722 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
19723 ; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm3 # 64-byte Folded Reload
19724 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
19725 ; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm9 # 64-byte Folded Reload
19726 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
19727 ; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm4 # 64-byte Folded Reload
19728 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
19729 ; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm11 # 64-byte Folded Reload
19730 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19731 ; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm6 # 64-byte Folded Reload
19732 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
19733 ; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm15 # 64-byte Folded Reload
19734 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
19735 ; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload
19736 ; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19737 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
19738 ; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload
19739 ; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19740 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
19741 ; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload
19742 ; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19743 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
19744 ; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload
19745 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
19746 ; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm14 # 64-byte Folded Reload
19747 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7]
19748 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
19749 ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0
19750 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19751 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm8[4,5,6,7]
19752 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
19753 ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
19754 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19755 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm13[4,5,6,7]
19756 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
19757 ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
19758 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19759 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm9[4,5,6,7]
19760 ; AVX512-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload
19761 ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
19762 ; AVX512-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
19763 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm11[4,5,6,7]
19764 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
19765 ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
19766 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19767 ; AVX512-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload
19768 ; AVX512-NEXT: # ymm0 = mem[0,1,2,3],ymm15[4,5,6,7]
19769 ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm0
19770 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19771 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19772 ; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
19773 ; AVX512-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
19774 ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm25
19775 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm10[4,5,6,7]
19776 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
19777 ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
19778 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19779 ; AVX512-NEXT: vmovdqa64 %zmm26, %zmm17 {%k1}
19780 ; AVX512-NEXT: vmovdqa 64(%rdi), %xmm8
19781 ; AVX512-NEXT: vinserti128 $1, 192(%rdi), %ymm8, %ymm8
19782 ; AVX512-NEXT: vmovdqa (%rdi), %xmm0
19783 ; AVX512-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm0
19784 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm28 = ymm0[0],ymm8[0],ymm0[2],ymm8[2]
19785 ; AVX512-NEXT: vinserti64x4 $0, %ymm28, %zmm17, %zmm26
19786 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
19787 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1}
19788 ; AVX512-NEXT: vmovdqa 576(%rdi), %xmm13
19789 ; AVX512-NEXT: vinserti128 $1, 704(%rdi), %ymm13, %ymm13
19790 ; AVX512-NEXT: vmovdqa64 512(%rdi), %xmm28
19791 ; AVX512-NEXT: vinserti32x4 $1, 640(%rdi), %ymm28, %ymm28
19792 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm28[0],ymm13[0],ymm28[2],ymm13[2]
19793 ; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm1
19794 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19795 ; AVX512-NEXT: vmovdqa64 %zmm24, %zmm16 {%k1}
19796 ; AVX512-NEXT: vmovdqa 1088(%rdi), %xmm4
19797 ; AVX512-NEXT: vinserti128 $1, 1216(%rdi), %ymm4, %ymm4
19798 ; AVX512-NEXT: vmovdqa 1024(%rdi), %xmm7
19799 ; AVX512-NEXT: vinserti128 $1, 1152(%rdi), %ymm7, %ymm7
19800 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm7[0],ymm4[0],ymm7[2],ymm4[2]
19801 ; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm16, %zmm24
19802 ; AVX512-NEXT: vmovdqa64 %zmm29, %zmm19 {%k1}
19803 ; AVX512-NEXT: vmovdqa 1600(%rdi), %xmm5
19804 ; AVX512-NEXT: vinserti128 $1, 1728(%rdi), %ymm5, %ymm5
19805 ; AVX512-NEXT: vmovdqa 1536(%rdi), %xmm11
19806 ; AVX512-NEXT: vinserti128 $1, 1664(%rdi), %ymm11, %ymm11
19807 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm29 = ymm11[0],ymm5[0],ymm11[2],ymm5[2]
19808 ; AVX512-NEXT: vinserti64x4 $0, %ymm29, %zmm19, %zmm19
19809 ; AVX512-NEXT: vmovdqa64 %zmm21, %zmm23 {%k1}
19810 ; AVX512-NEXT: vmovdqa 2112(%rdi), %xmm6
19811 ; AVX512-NEXT: vinserti128 $1, 2240(%rdi), %ymm6, %ymm6
19812 ; AVX512-NEXT: vmovdqa64 2048(%rdi), %xmm29
19813 ; AVX512-NEXT: vinserti32x4 $1, 2176(%rdi), %ymm29, %ymm29
19814 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm29[0],ymm6[0],ymm29[2],ymm6[2]
19815 ; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm23, %zmm3
19816 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
19817 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1}
19818 ; AVX512-NEXT: vmovdqa 2624(%rdi), %xmm10
19819 ; AVX512-NEXT: vinserti128 $1, 2752(%rdi), %ymm10, %ymm10
19820 ; AVX512-NEXT: vmovdqa64 2560(%rdi), %xmm23
19821 ; AVX512-NEXT: vinserti32x4 $1, 2688(%rdi), %ymm23, %ymm23
19822 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm20 = ymm23[0],ymm10[0],ymm23[2],ymm10[2]
19823 ; AVX512-NEXT: vinserti64x4 $0, %ymm20, %zmm18, %zmm18
19824 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
19825 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
19826 ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1}
19827 ; AVX512-NEXT: vmovdqa64 3136(%rdi), %xmm17
19828 ; AVX512-NEXT: vinserti32x4 $1, 3264(%rdi), %ymm17, %ymm17
19829 ; AVX512-NEXT: vmovdqa64 3072(%rdi), %xmm20
19830 ; AVX512-NEXT: vinserti32x4 $1, 3200(%rdi), %ymm20, %ymm20
19831 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm20[0],ymm17[0],ymm20[2],ymm17[2]
19832 ; AVX512-NEXT: vinserti64x4 $0, %ymm15, %zmm1, %zmm15
19833 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
19834 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm12 {%k1}
19835 ; AVX512-NEXT: vmovdqa 3648(%rdi), %xmm9
19836 ; AVX512-NEXT: vinserti128 $1, 3776(%rdi), %ymm9, %ymm9
19837 ; AVX512-NEXT: vmovdqa 3584(%rdi), %xmm14
19838 ; AVX512-NEXT: vinserti128 $1, 3712(%rdi), %ymm14, %ymm14
19839 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm16 = ymm14[0],ymm9[0],ymm14[2],ymm9[2]
19840 ; AVX512-NEXT: vinserti64x4 $0, %ymm16, %zmm12, %zmm1
19841 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
19842 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
19843 ; AVX512-NEXT: vmovdqa64 %zmm16, %zmm12 {%k1}
19844 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm16 = ymm20[1],ymm17[1],ymm20[3],ymm17[3]
19845 ; AVX512-NEXT: vinserti64x4 $0, %ymm16, %zmm12, %zmm16
19846 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
19847 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
19848 ; AVX512-NEXT: vmovdqa64 %zmm17, %zmm12 {%k1}
19849 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm28[1],ymm13[1],ymm28[3],ymm13[3]
19850 ; AVX512-NEXT: vinserti64x4 $0, %ymm13, %zmm12, %zmm13
19851 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
19852 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
19853 ; AVX512-NEXT: vmovdqa64 %zmm17, %zmm12 {%k1}
19854 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm8[1],ymm0[3],ymm8[3]
19855 ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0
19856 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
19857 ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm31 {%k1}
19858 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm11[1],ymm5[1],ymm11[3],ymm5[3]
19859 ; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm31, %zmm5
19860 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
19861 ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm27 {%k1}
19862 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm7[1],ymm4[1],ymm7[3],ymm4[3]
19863 ; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4
19864 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
19865 ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm30 {%k1}
19866 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm23[1],ymm10[1],ymm23[3],ymm10[3]
19867 ; AVX512-NEXT: vinserti64x4 $0, %ymm7, %zmm30, %zmm7
19868 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
19869 ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm22 {%k1}
19870 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm29[1],ymm6[1],ymm29[3],ymm6[3]
19871 ; AVX512-NEXT: vinserti64x4 $0, %ymm6, %zmm22, %zmm6
19872 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
19873 ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1}
19874 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm14[1],ymm9[1],ymm14[3],ymm9[3]
19875 ; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm2, %zmm2
19876 ; AVX512-NEXT: vmovdqa64 %zmm1, 448(%rsi)
19877 ; AVX512-NEXT: vmovdqa64 %zmm15, 384(%rsi)
19878 ; AVX512-NEXT: vmovdqa64 %zmm18, 320(%rsi)
19879 ; AVX512-NEXT: vmovdqa64 %zmm3, 256(%rsi)
19880 ; AVX512-NEXT: vmovdqa64 %zmm19, 192(%rsi)
19881 ; AVX512-NEXT: vmovdqa64 %zmm24, 128(%rsi)
19882 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
19883 ; AVX512-NEXT: vmovaps %zmm1, 64(%rsi)
19884 ; AVX512-NEXT: vmovdqa64 %zmm26, (%rsi)
19885 ; AVX512-NEXT: vmovdqa64 %zmm2, 448(%rdx)
19886 ; AVX512-NEXT: vmovdqa64 %zmm6, 256(%rdx)
19887 ; AVX512-NEXT: vmovdqa64 %zmm7, 320(%rdx)
19888 ; AVX512-NEXT: vmovdqa64 %zmm4, 128(%rdx)
19889 ; AVX512-NEXT: vmovdqa64 %zmm5, 192(%rdx)
19890 ; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx)
19891 ; AVX512-NEXT: vmovdqa64 %zmm13, 64(%rdx)
19892 ; AVX512-NEXT: vmovdqa64 %zmm16, 384(%rdx)
19893 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19894 ; AVX512-NEXT: vmovaps %zmm0, 448(%rcx)
19895 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19896 ; AVX512-NEXT: vmovaps %zmm0, 256(%rcx)
19897 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19898 ; AVX512-NEXT: vmovaps %zmm0, 320(%rcx)
19899 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19900 ; AVX512-NEXT: vmovaps %zmm0, 128(%rcx)
19901 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19902 ; AVX512-NEXT: vmovaps %zmm0, 192(%rcx)
19903 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19904 ; AVX512-NEXT: vmovaps %zmm0, (%rcx)
19905 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19906 ; AVX512-NEXT: vmovaps %zmm0, 64(%rcx)
19907 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19908 ; AVX512-NEXT: vmovaps %zmm0, 384(%rcx)
19909 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19910 ; AVX512-NEXT: vmovaps %zmm0, 448(%r8)
19911 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19912 ; AVX512-NEXT: vmovaps %zmm0, 256(%r8)
19913 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19914 ; AVX512-NEXT: vmovaps %zmm0, 320(%r8)
19915 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19916 ; AVX512-NEXT: vmovaps %zmm0, 128(%r8)
19917 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19918 ; AVX512-NEXT: vmovaps %zmm0, 192(%r8)
19919 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19920 ; AVX512-NEXT: vmovaps %zmm0, (%r8)
19921 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19922 ; AVX512-NEXT: vmovaps %zmm0, 64(%r8)
19923 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19924 ; AVX512-NEXT: vmovaps %zmm0, 384(%r8)
19925 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19926 ; AVX512-NEXT: vmovaps %zmm0, 448(%r9)
19927 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19928 ; AVX512-NEXT: vmovaps %zmm0, 256(%r9)
19929 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19930 ; AVX512-NEXT: vmovaps %zmm0, 320(%r9)
19931 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19932 ; AVX512-NEXT: vmovaps %zmm0, 128(%r9)
19933 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19934 ; AVX512-NEXT: vmovaps %zmm0, 192(%r9)
19935 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19936 ; AVX512-NEXT: vmovaps %zmm0, (%r9)
19937 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19938 ; AVX512-NEXT: vmovaps %zmm0, 64(%r9)
19939 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19940 ; AVX512-NEXT: vmovaps %zmm0, 384(%r9)
19941 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
19942 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19943 ; AVX512-NEXT: vmovaps %zmm0, 448(%rax)
19944 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19945 ; AVX512-NEXT: vmovaps %zmm0, 256(%rax)
19946 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19947 ; AVX512-NEXT: vmovaps %zmm0, 320(%rax)
19948 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19949 ; AVX512-NEXT: vmovaps %zmm0, 128(%rax)
19950 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19951 ; AVX512-NEXT: vmovaps %zmm0, 192(%rax)
19952 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19953 ; AVX512-NEXT: vmovaps %zmm0, (%rax)
19954 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19955 ; AVX512-NEXT: vmovaps %zmm0, 64(%rax)
19956 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19957 ; AVX512-NEXT: vmovaps %zmm0, 384(%rax)
19958 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
19959 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19960 ; AVX512-NEXT: vmovaps %zmm0, 448(%rax)
19961 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19962 ; AVX512-NEXT: vmovaps %zmm0, 256(%rax)
19963 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19964 ; AVX512-NEXT: vmovaps %zmm0, 320(%rax)
19965 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19966 ; AVX512-NEXT: vmovaps %zmm0, 128(%rax)
19967 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19968 ; AVX512-NEXT: vmovaps %zmm0, 192(%rax)
19969 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19970 ; AVX512-NEXT: vmovaps %zmm0, (%rax)
19971 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19972 ; AVX512-NEXT: vmovaps %zmm0, 64(%rax)
19973 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19974 ; AVX512-NEXT: vmovaps %zmm0, 384(%rax)
19975 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
19976 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19977 ; AVX512-NEXT: vmovaps %zmm0, 384(%rax)
19978 ; AVX512-NEXT: vmovdqa64 %zmm25, 448(%rax)
19979 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19980 ; AVX512-NEXT: vmovaps %zmm0, 256(%rax)
19981 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19982 ; AVX512-NEXT: vmovaps %zmm0, 320(%rax)
19983 ; AVX512-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
19984 ; AVX512-NEXT: vmovaps %zmm0, 128(%rax)
19985 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19986 ; AVX512-NEXT: vmovaps %zmm0, 192(%rax)
19987 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19988 ; AVX512-NEXT: vmovaps %zmm0, (%rax)
19989 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19990 ; AVX512-NEXT: vmovaps %zmm0, 64(%rax)
19991 ; AVX512-NEXT: addq $6664, %rsp # imm = 0x1A08
19992 ; AVX512-NEXT: vzeroupper
19993 ; AVX512-NEXT: retq
19995 ; AVX512-FCP-LABEL: load_i64_stride8_vf64:
19996 ; AVX512-FCP: # %bb.0:
19997 ; AVX512-FCP-NEXT: subq $6664, %rsp # imm = 0x1A08
19998 ; AVX512-FCP-NEXT: vmovdqa64 3392(%rdi), %zmm3
19999 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20000 ; AVX512-FCP-NEXT: vmovdqa64 3328(%rdi), %zmm16
20001 ; AVX512-FCP-NEXT: vmovdqa64 3520(%rdi), %zmm8
20002 ; AVX512-FCP-NEXT: vmovdqa64 3456(%rdi), %zmm28
20003 ; AVX512-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm10
20004 ; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20005 ; AVX512-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm11
20006 ; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20007 ; AVX512-FCP-NEXT: vmovdqa64 832(%rdi), %zmm4
20008 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20009 ; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm5
20010 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20011 ; AVX512-FCP-NEXT: vmovdqa64 960(%rdi), %zmm6
20012 ; AVX512-FCP-NEXT: vmovdqa64 896(%rdi), %zmm9
20013 ; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20014 ; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm12
20015 ; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20016 ; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm15
20017 ; AVX512-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20018 ; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm13
20019 ; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20020 ; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm14
20021 ; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20022 ; AVX512-FCP-NEXT: movb $-64, %al
20023 ; AVX512-FCP-NEXT: kmovw %eax, %k1
20024 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
20025 ; AVX512-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
20026 ; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm0
20027 ; AVX512-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20028 ; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm0
20029 ; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20030 ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm1
20031 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
20032 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
20033 ; AVX512-FCP-NEXT: vmovdqa 3264(%rdi), %ymm3
20034 ; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
20035 ; AVX512-FCP-NEXT: vmovdqa 3200(%rdi), %ymm0
20036 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
20037 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
20038 ; AVX512-FCP-NEXT: vmovdqa 3136(%rdi), %ymm3
20039 ; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
20040 ; AVX512-FCP-NEXT: vmovdqa 3072(%rdi), %ymm7
20041 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm7[0],ymm3[0],ymm7[2],ymm3[2]
20042 ; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
20043 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
20044 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20045 ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm0
20046 ; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm0
20047 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm9
20048 ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20049 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm1
20050 ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm2, %zmm1
20051 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
20052 ; AVX512-FCP-NEXT: vmovdqa 704(%rdi), %ymm0
20053 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
20054 ; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %ymm20
20055 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm20[0],ymm0[0],ymm20[2],ymm0[2]
20056 ; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %ymm22
20057 ; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %ymm19
20058 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm22[0],ymm19[2],ymm22[2]
20059 ; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
20060 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
20061 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20062 ; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm0
20063 ; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm2, %zmm0
20064 ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm1
20065 ; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm2, %zmm1
20066 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
20067 ; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm14
20068 ; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm15
20069 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm14[0],ymm15[2],ymm14[2]
20070 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %ymm21
20071 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm13
20072 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm13[0],ymm21[0],ymm13[2],ymm21[2]
20073 ; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
20074 ; AVX512-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm3
20075 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20076 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
20077 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20078 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm0
20079 ; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm2, %zmm0
20080 ; AVX512-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm1
20081 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20082 ; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm2, %zmm1
20083 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
20084 ; AVX512-FCP-NEXT: vmovdqa 1728(%rdi), %ymm3
20085 ; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
20086 ; AVX512-FCP-NEXT: vmovdqa 1664(%rdi), %ymm0
20087 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
20088 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
20089 ; AVX512-FCP-NEXT: vmovdqa 1600(%rdi), %ymm4
20090 ; AVX512-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
20091 ; AVX512-FCP-NEXT: vmovdqa 1536(%rdi), %ymm3
20092 ; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
20093 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2]
20094 ; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
20095 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
20096 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20097 ; AVX512-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm1
20098 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20099 ; AVX512-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm0
20100 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20101 ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
20102 ; AVX512-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm3
20103 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20104 ; AVX512-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm1
20105 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
20106 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
20107 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
20108 ; AVX512-FCP-NEXT: vmovdqa 1216(%rdi), %ymm3
20109 ; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
20110 ; AVX512-FCP-NEXT: vmovdqa 1152(%rdi), %ymm0
20111 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
20112 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
20113 ; AVX512-FCP-NEXT: vmovdqa 1088(%rdi), %ymm4
20114 ; AVX512-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
20115 ; AVX512-FCP-NEXT: vmovdqa 1024(%rdi), %ymm3
20116 ; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
20117 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2]
20118 ; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
20119 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
20120 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20121 ; AVX512-FCP-NEXT: vmovdqa64 3008(%rdi), %zmm1
20122 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20123 ; AVX512-FCP-NEXT: vmovdqa64 2944(%rdi), %zmm29
20124 ; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm0
20125 ; AVX512-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20126 ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
20127 ; AVX512-FCP-NEXT: vmovdqa64 2880(%rdi), %zmm24
20128 ; AVX512-FCP-NEXT: vmovdqa64 2816(%rdi), %zmm25
20129 ; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm1
20130 ; AVX512-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20131 ; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm2, %zmm1
20132 ; AVX512-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20133 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
20134 ; AVX512-FCP-NEXT: vmovdqa64 2752(%rdi), %ymm27
20135 ; AVX512-FCP-NEXT: vmovdqa64 2688(%rdi), %ymm26
20136 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm26[0],ymm27[0],ymm26[2],ymm27[2]
20137 ; AVX512-FCP-NEXT: vmovdqa64 2624(%rdi), %ymm30
20138 ; AVX512-FCP-NEXT: vmovdqa64 2560(%rdi), %ymm18
20139 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm18[0],ymm30[0],ymm18[2],ymm30[2]
20140 ; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
20141 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
20142 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20143 ; AVX512-FCP-NEXT: vmovdqa64 2496(%rdi), %zmm1
20144 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20145 ; AVX512-FCP-NEXT: vmovdqa64 2432(%rdi), %zmm31
20146 ; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm0
20147 ; AVX512-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20148 ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
20149 ; AVX512-FCP-NEXT: vmovdqa64 2368(%rdi), %zmm3
20150 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20151 ; AVX512-FCP-NEXT: vmovdqa64 2304(%rdi), %zmm1
20152 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20153 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
20154 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
20155 ; AVX512-FCP-NEXT: vmovdqa 2240(%rdi), %ymm12
20156 ; AVX512-FCP-NEXT: vmovdqa 2176(%rdi), %ymm11
20157 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm12[0],ymm11[2],ymm12[2]
20158 ; AVX512-FCP-NEXT: vmovdqa 2112(%rdi), %ymm10
20159 ; AVX512-FCP-NEXT: vmovdqa 2048(%rdi), %ymm3
20160 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm3[0],ymm10[0],ymm3[2],ymm10[2]
20161 ; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm5[2,3],ymm0[2,3]
20162 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
20163 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20164 ; AVX512-FCP-NEXT: vmovdqa64 4032(%rdi), %zmm1
20165 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20166 ; AVX512-FCP-NEXT: vmovdqa64 3968(%rdi), %zmm0
20167 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20168 ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
20169 ; AVX512-FCP-NEXT: vmovdqa64 3904(%rdi), %zmm1
20170 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20171 ; AVX512-FCP-NEXT: vmovdqa64 3840(%rdi), %zmm4
20172 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20173 ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm4, %zmm2
20174 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
20175 ; AVX512-FCP-NEXT: vmovdqa64 3776(%rdi), %ymm17
20176 ; AVX512-FCP-NEXT: vmovdqa64 3712(%rdi), %ymm23
20177 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm23[0],ymm17[0],ymm23[2],ymm17[2]
20178 ; AVX512-FCP-NEXT: vmovdqa 3648(%rdi), %ymm1
20179 ; AVX512-FCP-NEXT: vmovdqa 3584(%rdi), %ymm0
20180 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
20181 ; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm6[2,3],ymm5[2,3]
20182 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2
20183 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20184 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11]
20185 ; AVX512-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
20186 ; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm28
20187 ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm8
20188 ; AVX512-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20189 ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm6
20190 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
20191 ; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm2, %zmm6
20192 ; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm6 {%k1}
20193 ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
20194 ; AVX512-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload
20195 ; AVX512-FCP-NEXT: # ymm5 = ymm4[1],mem[1],ymm4[3],mem[3]
20196 ; AVX512-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
20197 ; AVX512-FCP-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3]
20198 ; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3]
20199 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5
20200 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20201 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
20202 ; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm5
20203 ; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm5
20204 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
20205 ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm6
20206 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
20207 ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm2, %zmm6
20208 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1}
20209 ; AVX512-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm20, %ymm5 # 32-byte Folded Reload
20210 ; AVX512-FCP-NEXT: # ymm5 = ymm20[1],mem[1],ymm20[3],mem[3]
20211 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm19[1],ymm22[1],ymm19[3],ymm22[3]
20212 ; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3]
20213 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5
20214 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20215 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
20216 ; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm5
20217 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
20218 ; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm2, %zmm5
20219 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
20220 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
20221 ; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm2, %zmm6
20222 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1}
20223 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm15[1],ymm14[1],ymm15[3],ymm14[3]
20224 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm13[1],ymm21[1],ymm13[3],ymm21[3]
20225 ; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3]
20226 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5
20227 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20228 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
20229 ; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm5
20230 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
20231 ; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm2, %zmm5
20232 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
20233 ; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm6
20234 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
20235 ; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm2, %zmm6
20236 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1}
20237 ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
20238 ; AVX512-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
20239 ; AVX512-FCP-NEXT: # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3]
20240 ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
20241 ; AVX512-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
20242 ; AVX512-FCP-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3]
20243 ; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3]
20244 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5
20245 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20246 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
20247 ; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 64-byte Folded Reload
20248 ; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload
20249 ; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 64-byte Folded Reload
20250 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1}
20251 ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
20252 ; AVX512-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
20253 ; AVX512-FCP-NEXT: # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3]
20254 ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
20255 ; AVX512-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
20256 ; AVX512-FCP-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3]
20257 ; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3]
20258 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5
20259 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20260 ; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm29 # 64-byte Folded Reload
20261 ; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm2, %zmm25
20262 ; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm25 {%k1}
20263 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm26[1],ymm27[1],ymm26[3],ymm27[3]
20264 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm18[1],ymm30[1],ymm18[3],ymm30[3]
20265 ; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3]
20266 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm25, %zmm5
20267 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20268 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
20269 ; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm2, %zmm31
20270 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
20271 ; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm6
20272 ; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 64-byte Folded Reload
20273 ; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm6 {%k1}
20274 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm11[1],ymm12[1],ymm11[3],ymm12[3]
20275 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],ymm10[1],ymm3[3],ymm10[3]
20276 ; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm5[2,3]
20277 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3
20278 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20279 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
20280 ; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm3
20281 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
20282 ; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm2, %zmm3
20283 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
20284 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
20285 ; AVX512-FCP-NEXT: vpermi2q %zmm11, %zmm12, %zmm2
20286 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1}
20287 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm23[1],ymm17[1],ymm23[3],ymm17[3]
20288 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
20289 ; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
20290 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
20291 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20292 ; AVX512-FCP-NEXT: vmovdqa64 3264(%rdi), %zmm3
20293 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20294 ; AVX512-FCP-NEXT: vmovdqa64 3200(%rdi), %zmm10
20295 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12]
20296 ; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
20297 ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm1
20298 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm1
20299 ; AVX512-FCP-NEXT: vmovdqa64 3136(%rdi), %zmm23
20300 ; AVX512-FCP-NEXT: vmovdqa64 3072(%rdi), %zmm2
20301 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20302 ; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm0, %zmm2
20303 ; AVX512-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20304 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
20305 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
20306 ; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm2
20307 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
20308 ; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm2
20309 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm8[0],zmm16[0],zmm8[2],zmm16[2],zmm8[4],zmm16[4],zmm8[6],zmm16[6]
20310 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
20311 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
20312 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20313 ; AVX512-FCP-NEXT: vmovdqa64 704(%rdi), %zmm2
20314 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20315 ; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %zmm1
20316 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20317 ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
20318 ; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm3
20319 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20320 ; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm2
20321 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20322 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2
20323 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
20324 ; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm2
20325 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
20326 ; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm0, %zmm2
20327 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm9[0],zmm4[0],zmm9[2],zmm4[2],zmm9[4],zmm4[4],zmm9[6],zmm4[6]
20328 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
20329 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
20330 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20331 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm2
20332 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20333 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1
20334 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20335 ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
20336 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3
20337 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20338 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm2
20339 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20340 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2
20341 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
20342 ; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm2
20343 ; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm2
20344 ; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm16
20345 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
20346 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm3[0],zmm22[0],zmm3[2],zmm22[2],zmm3[4],zmm22[4],zmm3[6],zmm22[6]
20347 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
20348 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
20349 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20350 ; AVX512-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm2
20351 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20352 ; AVX512-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm1
20353 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20354 ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
20355 ; AVX512-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm3
20356 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20357 ; AVX512-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm2
20358 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20359 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2
20360 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
20361 ; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm2
20362 ; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm2
20363 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm13[0],zmm21[2],zmm13[2],zmm21[4],zmm13[4],zmm21[6],zmm13[6]
20364 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
20365 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
20366 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20367 ; AVX512-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm2
20368 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20369 ; AVX512-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm1
20370 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20371 ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
20372 ; AVX512-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm3
20373 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20374 ; AVX512-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm2
20375 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20376 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2
20377 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
20378 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
20379 ; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm2
20380 ; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
20381 ; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm27 # 64-byte Reload
20382 ; AVX512-FCP-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm3 # 64-byte Folded Reload
20383 ; AVX512-FCP-NEXT: # zmm3 = zmm27[0],mem[0],zmm27[2],mem[2],zmm27[4],mem[4],zmm27[6],mem[6]
20384 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
20385 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
20386 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20387 ; AVX512-FCP-NEXT: vmovdqa64 2752(%rdi), %zmm3
20388 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20389 ; AVX512-FCP-NEXT: vmovdqa64 2688(%rdi), %zmm8
20390 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm1
20391 ; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20392 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm1
20393 ; AVX512-FCP-NEXT: vmovdqa64 2624(%rdi), %zmm3
20394 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20395 ; AVX512-FCP-NEXT: vmovdqa64 2560(%rdi), %zmm2
20396 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20397 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2
20398 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
20399 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
20400 ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm2
20401 ; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
20402 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
20403 ; AVX512-FCP-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm3 # 64-byte Folded Reload
20404 ; AVX512-FCP-NEXT: # zmm3 = zmm7[0],mem[0],zmm7[2],mem[2],zmm7[4],mem[4],zmm7[6],mem[6]
20405 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
20406 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
20407 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20408 ; AVX512-FCP-NEXT: vmovdqa64 2240(%rdi), %zmm2
20409 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20410 ; AVX512-FCP-NEXT: vmovdqa64 2176(%rdi), %zmm1
20411 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20412 ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
20413 ; AVX512-FCP-NEXT: vmovdqa64 2112(%rdi), %zmm3
20414 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20415 ; AVX512-FCP-NEXT: vmovdqa64 2048(%rdi), %zmm2
20416 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20417 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2
20418 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
20419 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
20420 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm2
20421 ; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm2
20422 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
20423 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm26[0],zmm4[0],zmm26[2],zmm4[2],zmm26[4],zmm4[4],zmm26[6],zmm4[6]
20424 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
20425 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
20426 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20427 ; AVX512-FCP-NEXT: vmovdqa64 3776(%rdi), %zmm2
20428 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20429 ; AVX512-FCP-NEXT: vmovdqa64 3712(%rdi), %zmm1
20430 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20431 ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
20432 ; AVX512-FCP-NEXT: vmovdqa64 3648(%rdi), %zmm3
20433 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20434 ; AVX512-FCP-NEXT: vmovdqa64 3584(%rdi), %zmm2
20435 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20436 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2
20437 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
20438 ; AVX512-FCP-NEXT: vpermi2q %zmm18, %zmm29, %zmm0
20439 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm12[0],zmm11[0],zmm12[2],zmm11[2],zmm12[4],zmm11[4],zmm12[6],zmm11[6]
20440 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
20441 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0
20442 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20443 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,5,13,5,13,5,13]
20444 ; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
20445 ; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20446 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
20447 ; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm10
20448 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
20449 ; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm0, %zmm2
20450 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm10[4,5,6,7]
20451 ; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm2
20452 ; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm2
20453 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
20454 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
20455 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm19[1],zmm25[1],zmm19[3],zmm25[3],zmm19[5],zmm25[5],zmm19[7],zmm25[7]
20456 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
20457 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
20458 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20459 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
20460 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
20461 ; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm1
20462 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
20463 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
20464 ; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm2
20465 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
20466 ; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm29
20467 ; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm2
20468 ; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm0, %zmm2
20469 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
20470 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
20471 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm15[1],zmm28[1],zmm15[3],zmm28[3],zmm15[5],zmm28[5],zmm15[7],zmm28[7]
20472 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
20473 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
20474 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20475 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
20476 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
20477 ; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm1
20478 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
20479 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
20480 ; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm0, %zmm2
20481 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
20482 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
20483 ; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm2
20484 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
20485 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
20486 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm12[1],zmm18[1],zmm12[3],zmm18[3],zmm12[5],zmm18[5],zmm12[7],zmm18[7]
20487 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
20488 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
20489 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20490 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
20491 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
20492 ; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm0, %zmm1
20493 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
20494 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
20495 ; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm2
20496 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
20497 ; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm2
20498 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
20499 ; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm0, %zmm2
20500 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
20501 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm3[1],zmm13[1],zmm3[3],zmm13[3],zmm3[5],zmm13[5],zmm3[7],zmm13[7]
20502 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
20503 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
20504 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20505 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
20506 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
20507 ; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm1
20508 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
20509 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
20510 ; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm0, %zmm2
20511 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
20512 ; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm31 # 64-byte Folded Reload
20513 ; AVX512-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm3 # 64-byte Folded Reload
20514 ; AVX512-FCP-NEXT: # zmm3 = zmm27[1],mem[1],zmm27[3],mem[3],zmm27[5],mem[5],zmm27[7],mem[7]
20515 ; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm3 {%k1}
20516 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
20517 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20518 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
20519 ; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm8
20520 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
20521 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
20522 ; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm2
20523 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm8[4,5,6,7]
20524 ; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 64-byte Folded Reload
20525 ; AVX512-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm3 # 64-byte Folded Reload
20526 ; AVX512-FCP-NEXT: # zmm3 = zmm7[1],mem[1],zmm7[3],mem[3],zmm7[5],mem[5],zmm7[7],mem[7]
20527 ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1}
20528 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
20529 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20530 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
20531 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
20532 ; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm0, %zmm1
20533 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
20534 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm2
20535 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
20536 ; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm2
20537 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
20538 ; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload
20539 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
20540 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm3[1],zmm4[1],zmm3[3],zmm4[3],zmm3[5],zmm4[5],zmm3[7],zmm4[7]
20541 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1}
20542 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
20543 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20544 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
20545 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm1
20546 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
20547 ; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm1
20548 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
20549 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm2
20550 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
20551 ; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm2
20552 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
20553 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
20554 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
20555 ; AVX512-FCP-NEXT: vpermi2q %zmm10, %zmm9, %zmm0
20556 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
20557 ; AVX512-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload
20558 ; AVX512-FCP-NEXT: # zmm2 = zmm2[1],mem[1],zmm2[3],mem[3],zmm2[5],mem[5],zmm2[7],mem[7]
20559 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
20560 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0
20561 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20562 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14]
20563 ; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
20564 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
20565 ; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm1
20566 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
20567 ; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
20568 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
20569 ; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm2
20570 ; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm2
20571 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
20572 ; AVX512-FCP-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm2 {%k1} # 64-byte Folded Reload
20573 ; AVX512-FCP-NEXT: # zmm2 {%k1} = zmm25[0],mem[0],zmm25[2],mem[2],zmm25[4],mem[4],zmm25[6],mem[6]
20574 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
20575 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20576 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
20577 ; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm1
20578 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
20579 ; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm2
20580 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
20581 ; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm0, %zmm15
20582 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm15 {%k1} = zmm29[0],zmm30[0],zmm29[2],zmm30[2],zmm29[4],zmm30[4],zmm29[6],zmm30[6]
20583 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm1
20584 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20585 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
20586 ; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm1
20587 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
20588 ; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm0, %zmm2
20589 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
20590 ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm2
20591 ; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm2
20592 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
20593 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
20594 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm30[0],zmm26[0],zmm30[2],zmm26[2],zmm30[4],zmm26[4],zmm30[6],zmm26[6]
20595 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
20596 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20597 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
20598 ; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm0, %zmm1
20599 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
20600 ; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm2
20601 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
20602 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
20603 ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm2
20604 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
20605 ; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm0, %zmm2
20606 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
20607 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm24[0],zmm23[0],zmm24[2],zmm23[2],zmm24[4],zmm23[4],zmm24[6],zmm23[6]
20608 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
20609 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20610 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
20611 ; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm1
20612 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
20613 ; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm0, %zmm2
20614 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
20615 ; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload
20616 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
20617 ; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm0, %zmm2
20618 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
20619 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
20620 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6]
20621 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
20622 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20623 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
20624 ; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm1
20625 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
20626 ; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm2
20627 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
20628 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
20629 ; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
20630 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
20631 ; AVX512-FCP-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm2 {%k1} # 64-byte Folded Reload
20632 ; AVX512-FCP-NEXT: # zmm2 {%k1} = zmm11[0],mem[0],zmm11[2],mem[2],zmm11[4],mem[4],zmm11[6],mem[6]
20633 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
20634 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20635 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
20636 ; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm0, %zmm1
20637 ; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm7
20638 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7]
20639 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
20640 ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm2
20641 ; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
20642 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
20643 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
20644 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm7[0],zmm28[0],zmm7[2],zmm28[2],zmm7[4],zmm28[4],zmm7[6],zmm28[6]
20645 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
20646 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20647 ; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm3
20648 ; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm4
20649 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm3[4,5,6,7]
20650 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
20651 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
20652 ; AVX512-FCP-NEXT: vpermi2q %zmm25, %zmm15, %zmm0
20653 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm9[0],zmm10[0],zmm9[2],zmm10[2],zmm9[4],zmm10[4],zmm9[6],zmm10[6]
20654 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
20655 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20656 ; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm1
20657 ; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm0
20658 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [7,15,7,15,7,15,7,15]
20659 ; AVX512-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
20660 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
20661 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm19
20662 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
20663 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
20664 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm19 {%k1} = zmm17[1],zmm4[1],zmm17[3],zmm4[3],zmm17[5],zmm4[5],zmm17[7],zmm4[7]
20665 ; AVX512-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20666 ; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm6
20667 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [0,8,0,8,0,8,0,8]
20668 ; AVX512-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
20669 ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm12, %zmm17
20670 ; AVX512-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20671 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [1,9,1,9,1,9,1,9]
20672 ; AVX512-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
20673 ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm2, %zmm6
20674 ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20675 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm12, %zmm1
20676 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20677 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm0
20678 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20679 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
20680 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm20
20681 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0
20682 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
20683 ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm5, %zmm1
20684 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
20685 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
20686 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm13[1],zmm11[1],zmm13[3],zmm11[3],zmm13[5],zmm11[5],zmm13[7],zmm11[7]
20687 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20688 ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm1
20689 ; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm12, %zmm13
20690 ; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20691 ; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm2, %zmm1
20692 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20693 ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm12, %zmm20
20694 ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm2, %zmm0
20695 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20696 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
20697 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm17
20698 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0
20699 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
20700 ; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm5, %zmm1
20701 ; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm10
20702 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm30[1],zmm26[1],zmm30[3],zmm26[3],zmm30[5],zmm26[5],zmm30[7],zmm26[7]
20703 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20704 ; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm1
20705 ; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm26
20706 ; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm12, %zmm26
20707 ; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm2, %zmm1
20708 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20709 ; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm12, %zmm17
20710 ; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm0
20711 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20712 ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm19
20713 ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm31
20714 ; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm3
20715 ; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm5, %zmm16
20716 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
20717 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm16 {%k1} = zmm24[1],zmm6[1],zmm24[3],zmm6[3],zmm24[5],zmm6[5],zmm24[7],zmm6[7]
20718 ; AVX512-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20719 ; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm0
20720 ; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm29
20721 ; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm12, %zmm29
20722 ; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm0
20723 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20724 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm12, %zmm19
20725 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm31
20726 ; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload
20727 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm16
20728 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm27
20729 ; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm5, %zmm1
20730 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm23[1],zmm18[1],zmm23[3],zmm18[3],zmm23[5],zmm18[5],zmm23[7],zmm18[7]
20731 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
20732 ; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm1
20733 ; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm24
20734 ; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm12, %zmm24
20735 ; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm2, %zmm1
20736 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20737 ; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm12, %zmm16
20738 ; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm2, %zmm27
20739 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
20740 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm18
20741 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm30
20742 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
20743 ; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm5, %zmm1
20744 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20745 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
20746 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm10[1],zmm0[1],zmm10[3],zmm0[3],zmm10[5],zmm0[5],zmm10[7],zmm0[7]
20747 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20748 ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm1
20749 ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm12, %zmm10
20750 ; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20751 ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
20752 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20753 ; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm12, %zmm18
20754 ; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm30
20755 ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm6
20756 ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm23
20757 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
20758 ; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm5, %zmm6
20759 ; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm0
20760 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
20761 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm21[1],zmm28[1],zmm21[3],zmm28[3],zmm21[5],zmm28[5],zmm21[7],zmm28[7]
20762 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm28
20763 ; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm1
20764 ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm12, %zmm21
20765 ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
20766 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20767 ; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm12, %zmm23
20768 ; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm22
20769 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
20770 ; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm0
20771 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
20772 ; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm12, %zmm0
20773 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20774 ; AVX512-FCP-NEXT: vpermi2q %zmm25, %zmm15, %zmm12
20775 ; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm3
20776 ; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm3
20777 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20778 ; AVX512-FCP-NEXT: vpermi2q %zmm25, %zmm15, %zmm2
20779 ; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm5, %zmm15
20780 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm14[1],zmm8[1],zmm14[3],zmm8[3],zmm14[5],zmm8[5],zmm14[7],zmm8[7]
20781 ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm25
20782 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
20783 ; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm7 # 64-byte Folded Reload
20784 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20785 ; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm0 # 64-byte Folded Reload
20786 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
20787 ; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm8 # 64-byte Folded Reload
20788 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
20789 ; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm1 # 64-byte Folded Reload
20790 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
20791 ; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm13 # 64-byte Folded Reload
20792 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
20793 ; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm3 # 64-byte Folded Reload
20794 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
20795 ; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm9 # 64-byte Folded Reload
20796 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
20797 ; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm4 # 64-byte Folded Reload
20798 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
20799 ; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm11 # 64-byte Folded Reload
20800 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
20801 ; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm6 # 64-byte Folded Reload
20802 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
20803 ; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm15 # 64-byte Folded Reload
20804 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
20805 ; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload
20806 ; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20807 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
20808 ; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload
20809 ; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20810 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
20811 ; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload
20812 ; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20813 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
20814 ; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload
20815 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
20816 ; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm14 # 64-byte Folded Reload
20817 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7]
20818 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
20819 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0
20820 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20821 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm8[4,5,6,7]
20822 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
20823 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
20824 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20825 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm13[4,5,6,7]
20826 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
20827 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
20828 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20829 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm9[4,5,6,7]
20830 ; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload
20831 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
20832 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
20833 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm11[4,5,6,7]
20834 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
20835 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
20836 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20837 ; AVX512-FCP-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload
20838 ; AVX512-FCP-NEXT: # ymm0 = mem[0,1,2,3],ymm15[4,5,6,7]
20839 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm0
20840 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20841 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20842 ; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
20843 ; AVX512-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
20844 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm25
20845 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm10[4,5,6,7]
20846 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
20847 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
20848 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20849 ; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm17 {%k1}
20850 ; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm8
20851 ; AVX512-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm8, %ymm8
20852 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
20853 ; AVX512-FCP-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm0
20854 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm28 = ymm0[0],ymm8[0],ymm0[2],ymm8[2]
20855 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm28, %zmm17, %zmm26
20856 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
20857 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1}
20858 ; AVX512-FCP-NEXT: vmovdqa 576(%rdi), %xmm13
20859 ; AVX512-FCP-NEXT: vinserti128 $1, 704(%rdi), %ymm13, %ymm13
20860 ; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %xmm28
20861 ; AVX512-FCP-NEXT: vinserti32x4 $1, 640(%rdi), %ymm28, %ymm28
20862 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm28[0],ymm13[0],ymm28[2],ymm13[2]
20863 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm1
20864 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20865 ; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm16 {%k1}
20866 ; AVX512-FCP-NEXT: vmovdqa 1088(%rdi), %xmm4
20867 ; AVX512-FCP-NEXT: vinserti128 $1, 1216(%rdi), %ymm4, %ymm4
20868 ; AVX512-FCP-NEXT: vmovdqa 1024(%rdi), %xmm7
20869 ; AVX512-FCP-NEXT: vinserti128 $1, 1152(%rdi), %ymm7, %ymm7
20870 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm7[0],ymm4[0],ymm7[2],ymm4[2]
20871 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm16, %zmm24
20872 ; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm19 {%k1}
20873 ; AVX512-FCP-NEXT: vmovdqa 1600(%rdi), %xmm5
20874 ; AVX512-FCP-NEXT: vinserti128 $1, 1728(%rdi), %ymm5, %ymm5
20875 ; AVX512-FCP-NEXT: vmovdqa 1536(%rdi), %xmm11
20876 ; AVX512-FCP-NEXT: vinserti128 $1, 1664(%rdi), %ymm11, %ymm11
20877 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm29 = ymm11[0],ymm5[0],ymm11[2],ymm5[2]
20878 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm29, %zmm19, %zmm19
20879 ; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm23 {%k1}
20880 ; AVX512-FCP-NEXT: vmovdqa 2112(%rdi), %xmm6
20881 ; AVX512-FCP-NEXT: vinserti128 $1, 2240(%rdi), %ymm6, %ymm6
20882 ; AVX512-FCP-NEXT: vmovdqa64 2048(%rdi), %xmm29
20883 ; AVX512-FCP-NEXT: vinserti32x4 $1, 2176(%rdi), %ymm29, %ymm29
20884 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm29[0],ymm6[0],ymm29[2],ymm6[2]
20885 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm23, %zmm3
20886 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
20887 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1}
20888 ; AVX512-FCP-NEXT: vmovdqa 2624(%rdi), %xmm10
20889 ; AVX512-FCP-NEXT: vinserti128 $1, 2752(%rdi), %ymm10, %ymm10
20890 ; AVX512-FCP-NEXT: vmovdqa64 2560(%rdi), %xmm23
20891 ; AVX512-FCP-NEXT: vinserti32x4 $1, 2688(%rdi), %ymm23, %ymm23
20892 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm20 = ymm23[0],ymm10[0],ymm23[2],ymm10[2]
20893 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm20, %zmm18, %zmm18
20894 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
20895 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
20896 ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1}
20897 ; AVX512-FCP-NEXT: vmovdqa64 3136(%rdi), %xmm17
20898 ; AVX512-FCP-NEXT: vinserti32x4 $1, 3264(%rdi), %ymm17, %ymm17
20899 ; AVX512-FCP-NEXT: vmovdqa64 3072(%rdi), %xmm20
20900 ; AVX512-FCP-NEXT: vinserti32x4 $1, 3200(%rdi), %ymm20, %ymm20
20901 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm20[0],ymm17[0],ymm20[2],ymm17[2]
20902 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm15, %zmm1, %zmm15
20903 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
20904 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm12 {%k1}
20905 ; AVX512-FCP-NEXT: vmovdqa 3648(%rdi), %xmm9
20906 ; AVX512-FCP-NEXT: vinserti128 $1, 3776(%rdi), %ymm9, %ymm9
20907 ; AVX512-FCP-NEXT: vmovdqa 3584(%rdi), %xmm14
20908 ; AVX512-FCP-NEXT: vinserti128 $1, 3712(%rdi), %ymm14, %ymm14
20909 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm16 = ymm14[0],ymm9[0],ymm14[2],ymm9[2]
20910 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm16, %zmm12, %zmm1
20911 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
20912 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
20913 ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm12 {%k1}
20914 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm16 = ymm20[1],ymm17[1],ymm20[3],ymm17[3]
20915 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm16, %zmm12, %zmm16
20916 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
20917 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
20918 ; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm12 {%k1}
20919 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm28[1],ymm13[1],ymm28[3],ymm13[3]
20920 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm12, %zmm13
20921 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
20922 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
20923 ; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm12 {%k1}
20924 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm8[1],ymm0[3],ymm8[3]
20925 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0
20926 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
20927 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm31 {%k1}
20928 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm11[1],ymm5[1],ymm11[3],ymm5[3]
20929 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm31, %zmm5
20930 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
20931 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm27 {%k1}
20932 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm7[1],ymm4[1],ymm7[3],ymm4[3]
20933 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4
20934 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
20935 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm30 {%k1}
20936 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm23[1],ymm10[1],ymm23[3],ymm10[3]
20937 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm30, %zmm7
20938 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
20939 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm22 {%k1}
20940 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm29[1],ymm6[1],ymm29[3],ymm6[3]
20941 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm22, %zmm6
20942 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
20943 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1}
20944 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm14[1],ymm9[1],ymm14[3],ymm9[3]
20945 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm2, %zmm2
20946 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 448(%rsi)
20947 ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, 384(%rsi)
20948 ; AVX512-FCP-NEXT: vmovdqa64 %zmm18, 320(%rsi)
20949 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 256(%rsi)
20950 ; AVX512-FCP-NEXT: vmovdqa64 %zmm19, 192(%rsi)
20951 ; AVX512-FCP-NEXT: vmovdqa64 %zmm24, 128(%rsi)
20952 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
20953 ; AVX512-FCP-NEXT: vmovaps %zmm1, 64(%rsi)
20954 ; AVX512-FCP-NEXT: vmovdqa64 %zmm26, (%rsi)
20955 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 448(%rdx)
20956 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 256(%rdx)
20957 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 320(%rdx)
20958 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 128(%rdx)
20959 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 192(%rdx)
20960 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%rdx)
20961 ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 64(%rdx)
20962 ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, 384(%rdx)
20963 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20964 ; AVX512-FCP-NEXT: vmovaps %zmm0, 448(%rcx)
20965 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20966 ; AVX512-FCP-NEXT: vmovaps %zmm0, 256(%rcx)
20967 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20968 ; AVX512-FCP-NEXT: vmovaps %zmm0, 320(%rcx)
20969 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20970 ; AVX512-FCP-NEXT: vmovaps %zmm0, 128(%rcx)
20971 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20972 ; AVX512-FCP-NEXT: vmovaps %zmm0, 192(%rcx)
20973 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20974 ; AVX512-FCP-NEXT: vmovaps %zmm0, (%rcx)
20975 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20976 ; AVX512-FCP-NEXT: vmovaps %zmm0, 64(%rcx)
20977 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20978 ; AVX512-FCP-NEXT: vmovaps %zmm0, 384(%rcx)
20979 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20980 ; AVX512-FCP-NEXT: vmovaps %zmm0, 448(%r8)
20981 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20982 ; AVX512-FCP-NEXT: vmovaps %zmm0, 256(%r8)
20983 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20984 ; AVX512-FCP-NEXT: vmovaps %zmm0, 320(%r8)
20985 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20986 ; AVX512-FCP-NEXT: vmovaps %zmm0, 128(%r8)
20987 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20988 ; AVX512-FCP-NEXT: vmovaps %zmm0, 192(%r8)
20989 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20990 ; AVX512-FCP-NEXT: vmovaps %zmm0, (%r8)
20991 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20992 ; AVX512-FCP-NEXT: vmovaps %zmm0, 64(%r8)
20993 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20994 ; AVX512-FCP-NEXT: vmovaps %zmm0, 384(%r8)
20995 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20996 ; AVX512-FCP-NEXT: vmovaps %zmm0, 448(%r9)
20997 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20998 ; AVX512-FCP-NEXT: vmovaps %zmm0, 256(%r9)
20999 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
21000 ; AVX512-FCP-NEXT: vmovaps %zmm0, 320(%r9)
21001 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
21002 ; AVX512-FCP-NEXT: vmovaps %zmm0, 128(%r9)
21003 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
21004 ; AVX512-FCP-NEXT: vmovaps %zmm0, 192(%r9)
21005 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
21006 ; AVX512-FCP-NEXT: vmovaps %zmm0, (%r9)
21007 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
21008 ; AVX512-FCP-NEXT: vmovaps %zmm0, 64(%r9)
21009 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
21010 ; AVX512-FCP-NEXT: vmovaps %zmm0, 384(%r9)
21011 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
21012 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
21013 ; AVX512-FCP-NEXT: vmovaps %zmm0, 448(%rax)
21014 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
21015 ; AVX512-FCP-NEXT: vmovaps %zmm0, 256(%rax)
21016 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
21017 ; AVX512-FCP-NEXT: vmovaps %zmm0, 320(%rax)
21018 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
21019 ; AVX512-FCP-NEXT: vmovaps %zmm0, 128(%rax)
21020 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
21021 ; AVX512-FCP-NEXT: vmovaps %zmm0, 192(%rax)
21022 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
21023 ; AVX512-FCP-NEXT: vmovaps %zmm0, (%rax)
21024 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
21025 ; AVX512-FCP-NEXT: vmovaps %zmm0, 64(%rax)
21026 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
21027 ; AVX512-FCP-NEXT: vmovaps %zmm0, 384(%rax)
21028 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
21029 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
21030 ; AVX512-FCP-NEXT: vmovaps %zmm0, 448(%rax)
21031 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
21032 ; AVX512-FCP-NEXT: vmovaps %zmm0, 256(%rax)
21033 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
21034 ; AVX512-FCP-NEXT: vmovaps %zmm0, 320(%rax)
21035 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
21036 ; AVX512-FCP-NEXT: vmovaps %zmm0, 128(%rax)
21037 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
21038 ; AVX512-FCP-NEXT: vmovaps %zmm0, 192(%rax)
21039 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
21040 ; AVX512-FCP-NEXT: vmovaps %zmm0, (%rax)
21041 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
21042 ; AVX512-FCP-NEXT: vmovaps %zmm0, 64(%rax)
21043 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
21044 ; AVX512-FCP-NEXT: vmovaps %zmm0, 384(%rax)
21045 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
21046 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
21047 ; AVX512-FCP-NEXT: vmovaps %zmm0, 384(%rax)
21048 ; AVX512-FCP-NEXT: vmovdqa64 %zmm25, 448(%rax)
21049 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
21050 ; AVX512-FCP-NEXT: vmovaps %zmm0, 256(%rax)
21051 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
21052 ; AVX512-FCP-NEXT: vmovaps %zmm0, 320(%rax)
21053 ; AVX512-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
21054 ; AVX512-FCP-NEXT: vmovaps %zmm0, 128(%rax)
21055 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
21056 ; AVX512-FCP-NEXT: vmovaps %zmm0, 192(%rax)
21057 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
21058 ; AVX512-FCP-NEXT: vmovaps %zmm0, (%rax)
21059 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
21060 ; AVX512-FCP-NEXT: vmovaps %zmm0, 64(%rax)
21061 ; AVX512-FCP-NEXT: addq $6664, %rsp # imm = 0x1A08
21062 ; AVX512-FCP-NEXT: vzeroupper
21063 ; AVX512-FCP-NEXT: retq
21065 ; AVX512DQ-LABEL: load_i64_stride8_vf64:
21066 ; AVX512DQ: # %bb.0:
21067 ; AVX512DQ-NEXT: subq $6664, %rsp # imm = 0x1A08
21068 ; AVX512DQ-NEXT: vmovdqa64 3392(%rdi), %zmm3
21069 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21070 ; AVX512DQ-NEXT: vmovdqa64 3328(%rdi), %zmm16
21071 ; AVX512DQ-NEXT: vmovdqa64 3520(%rdi), %zmm8
21072 ; AVX512DQ-NEXT: vmovdqa64 3456(%rdi), %zmm28
21073 ; AVX512DQ-NEXT: vmovdqa64 1856(%rdi), %zmm10
21074 ; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21075 ; AVX512DQ-NEXT: vmovdqa64 1984(%rdi), %zmm11
21076 ; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21077 ; AVX512DQ-NEXT: vmovdqa64 832(%rdi), %zmm4
21078 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21079 ; AVX512DQ-NEXT: vmovdqa64 768(%rdi), %zmm5
21080 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21081 ; AVX512DQ-NEXT: vmovdqa64 960(%rdi), %zmm6
21082 ; AVX512DQ-NEXT: vmovdqa64 896(%rdi), %zmm9
21083 ; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21084 ; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm12
21085 ; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21086 ; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm15
21087 ; AVX512DQ-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21088 ; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm13
21089 ; AVX512DQ-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21090 ; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm14
21091 ; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21092 ; AVX512DQ-NEXT: movb $-64, %al
21093 ; AVX512DQ-NEXT: kmovw %eax, %k1
21094 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
21095 ; AVX512DQ-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
21096 ; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm0
21097 ; AVX512DQ-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21098 ; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm2, %zmm0
21099 ; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21100 ; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm1
21101 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
21102 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
21103 ; AVX512DQ-NEXT: vmovdqa 3264(%rdi), %ymm3
21104 ; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
21105 ; AVX512DQ-NEXT: vmovdqa 3200(%rdi), %ymm0
21106 ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
21107 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
21108 ; AVX512DQ-NEXT: vmovdqa 3136(%rdi), %ymm3
21109 ; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
21110 ; AVX512DQ-NEXT: vmovdqa 3072(%rdi), %ymm7
21111 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm7[0],ymm3[0],ymm7[2],ymm3[2]
21112 ; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
21113 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
21114 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21115 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm0
21116 ; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm2, %zmm0
21117 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm9
21118 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21119 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm1
21120 ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm2, %zmm1
21121 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
21122 ; AVX512DQ-NEXT: vmovdqa 704(%rdi), %ymm0
21123 ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
21124 ; AVX512DQ-NEXT: vmovdqa64 640(%rdi), %ymm20
21125 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm20[0],ymm0[0],ymm20[2],ymm0[2]
21126 ; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %ymm22
21127 ; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %ymm19
21128 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm22[0],ymm19[2],ymm22[2]
21129 ; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
21130 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
21131 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21132 ; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm0
21133 ; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm2, %zmm0
21134 ; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm1
21135 ; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm2, %zmm1
21136 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
21137 ; AVX512DQ-NEXT: vmovdqa 192(%rdi), %ymm14
21138 ; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm15
21139 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm14[0],ymm15[2],ymm14[2]
21140 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %ymm21
21141 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm13
21142 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm13[0],ymm21[0],ymm13[2],ymm21[2]
21143 ; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
21144 ; AVX512DQ-NEXT: vmovdqa64 1920(%rdi), %zmm3
21145 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21146 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
21147 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21148 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm0
21149 ; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm2, %zmm0
21150 ; AVX512DQ-NEXT: vmovdqa64 1792(%rdi), %zmm1
21151 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21152 ; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm2, %zmm1
21153 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
21154 ; AVX512DQ-NEXT: vmovdqa 1728(%rdi), %ymm3
21155 ; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
21156 ; AVX512DQ-NEXT: vmovdqa 1664(%rdi), %ymm0
21157 ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
21158 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
21159 ; AVX512DQ-NEXT: vmovdqa 1600(%rdi), %ymm4
21160 ; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
21161 ; AVX512DQ-NEXT: vmovdqa 1536(%rdi), %ymm3
21162 ; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
21163 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2]
21164 ; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
21165 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
21166 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21167 ; AVX512DQ-NEXT: vmovdqa64 1472(%rdi), %zmm1
21168 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21169 ; AVX512DQ-NEXT: vmovdqa64 1408(%rdi), %zmm0
21170 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21171 ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
21172 ; AVX512DQ-NEXT: vmovdqa64 1344(%rdi), %zmm3
21173 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21174 ; AVX512DQ-NEXT: vmovdqa64 1280(%rdi), %zmm1
21175 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
21176 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
21177 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
21178 ; AVX512DQ-NEXT: vmovdqa 1216(%rdi), %ymm3
21179 ; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
21180 ; AVX512DQ-NEXT: vmovdqa 1152(%rdi), %ymm0
21181 ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
21182 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
21183 ; AVX512DQ-NEXT: vmovdqa 1088(%rdi), %ymm4
21184 ; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
21185 ; AVX512DQ-NEXT: vmovdqa 1024(%rdi), %ymm3
21186 ; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
21187 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2]
21188 ; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
21189 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
21190 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21191 ; AVX512DQ-NEXT: vmovdqa64 3008(%rdi), %zmm1
21192 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21193 ; AVX512DQ-NEXT: vmovdqa64 2944(%rdi), %zmm29
21194 ; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm0
21195 ; AVX512DQ-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21196 ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
21197 ; AVX512DQ-NEXT: vmovdqa64 2880(%rdi), %zmm24
21198 ; AVX512DQ-NEXT: vmovdqa64 2816(%rdi), %zmm25
21199 ; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm1
21200 ; AVX512DQ-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21201 ; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm2, %zmm1
21202 ; AVX512DQ-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21203 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
21204 ; AVX512DQ-NEXT: vmovdqa64 2752(%rdi), %ymm27
21205 ; AVX512DQ-NEXT: vmovdqa64 2688(%rdi), %ymm26
21206 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm26[0],ymm27[0],ymm26[2],ymm27[2]
21207 ; AVX512DQ-NEXT: vmovdqa64 2624(%rdi), %ymm30
21208 ; AVX512DQ-NEXT: vmovdqa64 2560(%rdi), %ymm18
21209 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm18[0],ymm30[0],ymm18[2],ymm30[2]
21210 ; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
21211 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
21212 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21213 ; AVX512DQ-NEXT: vmovdqa64 2496(%rdi), %zmm1
21214 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21215 ; AVX512DQ-NEXT: vmovdqa64 2432(%rdi), %zmm31
21216 ; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm0
21217 ; AVX512DQ-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21218 ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
21219 ; AVX512DQ-NEXT: vmovdqa64 2368(%rdi), %zmm3
21220 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21221 ; AVX512DQ-NEXT: vmovdqa64 2304(%rdi), %zmm1
21222 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21223 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
21224 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
21225 ; AVX512DQ-NEXT: vmovdqa 2240(%rdi), %ymm12
21226 ; AVX512DQ-NEXT: vmovdqa 2176(%rdi), %ymm11
21227 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm12[0],ymm11[2],ymm12[2]
21228 ; AVX512DQ-NEXT: vmovdqa 2112(%rdi), %ymm10
21229 ; AVX512DQ-NEXT: vmovdqa 2048(%rdi), %ymm3
21230 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm3[0],ymm10[0],ymm3[2],ymm10[2]
21231 ; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm5[2,3],ymm0[2,3]
21232 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
21233 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21234 ; AVX512DQ-NEXT: vmovdqa64 4032(%rdi), %zmm1
21235 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21236 ; AVX512DQ-NEXT: vmovdqa64 3968(%rdi), %zmm0
21237 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21238 ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
21239 ; AVX512DQ-NEXT: vmovdqa64 3904(%rdi), %zmm1
21240 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21241 ; AVX512DQ-NEXT: vmovdqa64 3840(%rdi), %zmm4
21242 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21243 ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm4, %zmm2
21244 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
21245 ; AVX512DQ-NEXT: vmovdqa64 3776(%rdi), %ymm17
21246 ; AVX512DQ-NEXT: vmovdqa64 3712(%rdi), %ymm23
21247 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm23[0],ymm17[0],ymm23[2],ymm17[2]
21248 ; AVX512DQ-NEXT: vmovdqa 3648(%rdi), %ymm1
21249 ; AVX512DQ-NEXT: vmovdqa 3584(%rdi), %ymm0
21250 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
21251 ; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm6[2,3],ymm5[2,3]
21252 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2
21253 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21254 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11]
21255 ; AVX512DQ-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
21256 ; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm2, %zmm28
21257 ; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm8
21258 ; AVX512DQ-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21259 ; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm6
21260 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
21261 ; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm2, %zmm6
21262 ; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm6 {%k1}
21263 ; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
21264 ; AVX512DQ-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload
21265 ; AVX512DQ-NEXT: # ymm5 = ymm4[1],mem[1],ymm4[3],mem[3]
21266 ; AVX512DQ-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
21267 ; AVX512DQ-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3]
21268 ; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3]
21269 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5
21270 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21271 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
21272 ; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm5
21273 ; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm2, %zmm5
21274 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
21275 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm6
21276 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
21277 ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm2, %zmm6
21278 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1}
21279 ; AVX512DQ-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm20, %ymm5 # 32-byte Folded Reload
21280 ; AVX512DQ-NEXT: # ymm5 = ymm20[1],mem[1],ymm20[3],mem[3]
21281 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm19[1],ymm22[1],ymm19[3],ymm22[3]
21282 ; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3]
21283 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5
21284 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21285 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
21286 ; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm5
21287 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
21288 ; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm2, %zmm5
21289 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
21290 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
21291 ; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm2, %zmm6
21292 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1}
21293 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm15[1],ymm14[1],ymm15[3],ymm14[3]
21294 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm13[1],ymm21[1],ymm13[3],ymm21[3]
21295 ; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3]
21296 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5
21297 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21298 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
21299 ; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm5
21300 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
21301 ; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm2, %zmm5
21302 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
21303 ; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm6
21304 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
21305 ; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm2, %zmm6
21306 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1}
21307 ; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
21308 ; AVX512DQ-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
21309 ; AVX512DQ-NEXT: # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3]
21310 ; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
21311 ; AVX512DQ-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
21312 ; AVX512DQ-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3]
21313 ; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3]
21314 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5
21315 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21316 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
21317 ; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 64-byte Folded Reload
21318 ; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload
21319 ; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 64-byte Folded Reload
21320 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1}
21321 ; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
21322 ; AVX512DQ-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
21323 ; AVX512DQ-NEXT: # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3]
21324 ; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
21325 ; AVX512DQ-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
21326 ; AVX512DQ-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3]
21327 ; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3]
21328 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5
21329 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21330 ; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm29 # 64-byte Folded Reload
21331 ; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm2, %zmm25
21332 ; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm25 {%k1}
21333 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm26[1],ymm27[1],ymm26[3],ymm27[3]
21334 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm18[1],ymm30[1],ymm18[3],ymm30[3]
21335 ; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3]
21336 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm25, %zmm5
21337 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21338 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
21339 ; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm2, %zmm31
21340 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
21341 ; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm6
21342 ; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 64-byte Folded Reload
21343 ; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm6 {%k1}
21344 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm11[1],ymm12[1],ymm11[3],ymm12[3]
21345 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],ymm10[1],ymm3[3],ymm10[3]
21346 ; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm5[2,3]
21347 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3
21348 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21349 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
21350 ; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm3
21351 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
21352 ; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm2, %zmm3
21353 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
21354 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
21355 ; AVX512DQ-NEXT: vpermi2q %zmm11, %zmm12, %zmm2
21356 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1}
21357 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm23[1],ymm17[1],ymm23[3],ymm17[3]
21358 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
21359 ; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
21360 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
21361 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21362 ; AVX512DQ-NEXT: vmovdqa64 3264(%rdi), %zmm3
21363 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21364 ; AVX512DQ-NEXT: vmovdqa64 3200(%rdi), %zmm10
21365 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12]
21366 ; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
21367 ; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm1
21368 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm0, %zmm1
21369 ; AVX512DQ-NEXT: vmovdqa64 3136(%rdi), %zmm23
21370 ; AVX512DQ-NEXT: vmovdqa64 3072(%rdi), %zmm2
21371 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21372 ; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm0, %zmm2
21373 ; AVX512DQ-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21374 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
21375 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
21376 ; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm2
21377 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
21378 ; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm0, %zmm2
21379 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm8[0],zmm16[0],zmm8[2],zmm16[2],zmm8[4],zmm16[4],zmm8[6],zmm16[6]
21380 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
21381 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
21382 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21383 ; AVX512DQ-NEXT: vmovdqa64 704(%rdi), %zmm2
21384 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21385 ; AVX512DQ-NEXT: vmovdqa64 640(%rdi), %zmm1
21386 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21387 ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
21388 ; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %zmm3
21389 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21390 ; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %zmm2
21391 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21392 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm0, %zmm2
21393 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
21394 ; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm2
21395 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
21396 ; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm0, %zmm2
21397 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm9[0],zmm4[0],zmm9[2],zmm4[2],zmm9[4],zmm4[4],zmm9[6],zmm4[6]
21398 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
21399 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
21400 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21401 ; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm2
21402 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21403 ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm1
21404 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21405 ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
21406 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm3
21407 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21408 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm2
21409 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21410 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm0, %zmm2
21411 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
21412 ; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm2
21413 ; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm0, %zmm2
21414 ; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm16
21415 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
21416 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm3[0],zmm22[0],zmm3[2],zmm22[2],zmm3[4],zmm22[4],zmm3[6],zmm22[6]
21417 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
21418 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
21419 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21420 ; AVX512DQ-NEXT: vmovdqa64 1728(%rdi), %zmm2
21421 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21422 ; AVX512DQ-NEXT: vmovdqa64 1664(%rdi), %zmm1
21423 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21424 ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
21425 ; AVX512DQ-NEXT: vmovdqa64 1600(%rdi), %zmm3
21426 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21427 ; AVX512DQ-NEXT: vmovdqa64 1536(%rdi), %zmm2
21428 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21429 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm0, %zmm2
21430 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
21431 ; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm2
21432 ; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm0, %zmm2
21433 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm13[0],zmm21[2],zmm13[2],zmm21[4],zmm13[4],zmm21[6],zmm13[6]
21434 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
21435 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
21436 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21437 ; AVX512DQ-NEXT: vmovdqa64 1216(%rdi), %zmm2
21438 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21439 ; AVX512DQ-NEXT: vmovdqa64 1152(%rdi), %zmm1
21440 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21441 ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
21442 ; AVX512DQ-NEXT: vmovdqa64 1088(%rdi), %zmm3
21443 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21444 ; AVX512DQ-NEXT: vmovdqa64 1024(%rdi), %zmm2
21445 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21446 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm0, %zmm2
21447 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
21448 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
21449 ; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm2
21450 ; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
21451 ; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm27 # 64-byte Reload
21452 ; AVX512DQ-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm3 # 64-byte Folded Reload
21453 ; AVX512DQ-NEXT: # zmm3 = zmm27[0],mem[0],zmm27[2],mem[2],zmm27[4],mem[4],zmm27[6],mem[6]
21454 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
21455 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
21456 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21457 ; AVX512DQ-NEXT: vmovdqa64 2752(%rdi), %zmm3
21458 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21459 ; AVX512DQ-NEXT: vmovdqa64 2688(%rdi), %zmm8
21460 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm1
21461 ; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21462 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm0, %zmm1
21463 ; AVX512DQ-NEXT: vmovdqa64 2624(%rdi), %zmm3
21464 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21465 ; AVX512DQ-NEXT: vmovdqa64 2560(%rdi), %zmm2
21466 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21467 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm0, %zmm2
21468 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
21469 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
21470 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm2
21471 ; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
21472 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
21473 ; AVX512DQ-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm3 # 64-byte Folded Reload
21474 ; AVX512DQ-NEXT: # zmm3 = zmm7[0],mem[0],zmm7[2],mem[2],zmm7[4],mem[4],zmm7[6],mem[6]
21475 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
21476 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
21477 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21478 ; AVX512DQ-NEXT: vmovdqa64 2240(%rdi), %zmm2
21479 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21480 ; AVX512DQ-NEXT: vmovdqa64 2176(%rdi), %zmm1
21481 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21482 ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
21483 ; AVX512DQ-NEXT: vmovdqa64 2112(%rdi), %zmm3
21484 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21485 ; AVX512DQ-NEXT: vmovdqa64 2048(%rdi), %zmm2
21486 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21487 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm0, %zmm2
21488 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
21489 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
21490 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm2
21491 ; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm0, %zmm2
21492 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
21493 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm26[0],zmm4[0],zmm26[2],zmm4[2],zmm26[4],zmm4[4],zmm26[6],zmm4[6]
21494 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
21495 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
21496 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21497 ; AVX512DQ-NEXT: vmovdqa64 3776(%rdi), %zmm2
21498 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21499 ; AVX512DQ-NEXT: vmovdqa64 3712(%rdi), %zmm1
21500 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21501 ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
21502 ; AVX512DQ-NEXT: vmovdqa64 3648(%rdi), %zmm3
21503 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21504 ; AVX512DQ-NEXT: vmovdqa64 3584(%rdi), %zmm2
21505 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21506 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm0, %zmm2
21507 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
21508 ; AVX512DQ-NEXT: vpermi2q %zmm18, %zmm29, %zmm0
21509 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm12[0],zmm11[0],zmm12[2],zmm11[2],zmm12[4],zmm11[4],zmm12[6],zmm11[6]
21510 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
21511 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0
21512 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21513 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,5,13,5,13,5,13]
21514 ; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
21515 ; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21516 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
21517 ; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm0, %zmm10
21518 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
21519 ; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm0, %zmm2
21520 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm10[4,5,6,7]
21521 ; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm2
21522 ; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm0, %zmm2
21523 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
21524 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
21525 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm19[1],zmm25[1],zmm19[3],zmm25[3],zmm19[5],zmm25[5],zmm19[7],zmm25[7]
21526 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
21527 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
21528 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21529 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
21530 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
21531 ; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm0, %zmm1
21532 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
21533 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
21534 ; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm0, %zmm2
21535 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
21536 ; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm29
21537 ; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm2
21538 ; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm0, %zmm2
21539 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
21540 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
21541 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm15[1],zmm28[1],zmm15[3],zmm28[3],zmm15[5],zmm28[5],zmm15[7],zmm28[7]
21542 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
21543 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
21544 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21545 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
21546 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
21547 ; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm0, %zmm1
21548 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
21549 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
21550 ; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm0, %zmm2
21551 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
21552 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
21553 ; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm0, %zmm2
21554 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
21555 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
21556 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm12[1],zmm18[1],zmm12[3],zmm18[3],zmm12[5],zmm18[5],zmm12[7],zmm18[7]
21557 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
21558 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
21559 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21560 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
21561 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
21562 ; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm0, %zmm1
21563 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
21564 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
21565 ; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm0, %zmm2
21566 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
21567 ; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm2
21568 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
21569 ; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm0, %zmm2
21570 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
21571 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm3[1],zmm13[1],zmm3[3],zmm13[3],zmm3[5],zmm13[5],zmm3[7],zmm13[7]
21572 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
21573 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
21574 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21575 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
21576 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
21577 ; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm0, %zmm1
21578 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
21579 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
21580 ; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm0, %zmm2
21581 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
21582 ; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm31 # 64-byte Folded Reload
21583 ; AVX512DQ-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm3 # 64-byte Folded Reload
21584 ; AVX512DQ-NEXT: # zmm3 = zmm27[1],mem[1],zmm27[3],mem[3],zmm27[5],mem[5],zmm27[7],mem[7]
21585 ; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm3 {%k1}
21586 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
21587 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21588 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
21589 ; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm0, %zmm8
21590 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
21591 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
21592 ; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm0, %zmm2
21593 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm8[4,5,6,7]
21594 ; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 64-byte Folded Reload
21595 ; AVX512DQ-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm3 # 64-byte Folded Reload
21596 ; AVX512DQ-NEXT: # zmm3 = zmm7[1],mem[1],zmm7[3],mem[3],zmm7[5],mem[5],zmm7[7],mem[7]
21597 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1}
21598 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
21599 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21600 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
21601 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
21602 ; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm0, %zmm1
21603 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
21604 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm2
21605 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
21606 ; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm0, %zmm2
21607 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
21608 ; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload
21609 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
21610 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm3[1],zmm4[1],zmm3[3],zmm4[3],zmm3[5],zmm4[5],zmm3[7],zmm4[7]
21611 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1}
21612 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
21613 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21614 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
21615 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm1
21616 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
21617 ; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm0, %zmm1
21618 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
21619 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm2
21620 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
21621 ; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm0, %zmm2
21622 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
21623 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
21624 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
21625 ; AVX512DQ-NEXT: vpermi2q %zmm10, %zmm9, %zmm0
21626 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
21627 ; AVX512DQ-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload
21628 ; AVX512DQ-NEXT: # zmm2 = zmm2[1],mem[1],zmm2[3],mem[3],zmm2[5],mem[5],zmm2[7],mem[7]
21629 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
21630 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0
21631 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21632 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14]
21633 ; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
21634 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
21635 ; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm0, %zmm1
21636 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
21637 ; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
21638 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
21639 ; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm2
21640 ; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm0, %zmm2
21641 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
21642 ; AVX512DQ-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm2 {%k1} # 64-byte Folded Reload
21643 ; AVX512DQ-NEXT: # zmm2 {%k1} = zmm25[0],mem[0],zmm25[2],mem[2],zmm25[4],mem[4],zmm25[6],mem[6]
21644 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
21645 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21646 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
21647 ; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm0, %zmm1
21648 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
21649 ; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm0, %zmm2
21650 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
21651 ; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm0, %zmm15
21652 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm15 {%k1} = zmm29[0],zmm30[0],zmm29[2],zmm30[2],zmm29[4],zmm30[4],zmm29[6],zmm30[6]
21653 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm1
21654 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21655 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
21656 ; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm0, %zmm1
21657 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
21658 ; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm0, %zmm2
21659 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
21660 ; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm2
21661 ; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm0, %zmm2
21662 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
21663 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
21664 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm30[0],zmm26[0],zmm30[2],zmm26[2],zmm30[4],zmm26[4],zmm30[6],zmm26[6]
21665 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
21666 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21667 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
21668 ; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm0, %zmm1
21669 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
21670 ; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm0, %zmm2
21671 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
21672 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
21673 ; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm2
21674 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
21675 ; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm0, %zmm2
21676 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
21677 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm24[0],zmm23[0],zmm24[2],zmm23[2],zmm24[4],zmm23[4],zmm24[6],zmm23[6]
21678 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
21679 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21680 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
21681 ; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm0, %zmm1
21682 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
21683 ; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm0, %zmm2
21684 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
21685 ; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload
21686 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
21687 ; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm0, %zmm2
21688 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
21689 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
21690 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6]
21691 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
21692 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21693 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
21694 ; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm0, %zmm1
21695 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
21696 ; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm0, %zmm2
21697 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
21698 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
21699 ; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
21700 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
21701 ; AVX512DQ-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm2 {%k1} # 64-byte Folded Reload
21702 ; AVX512DQ-NEXT: # zmm2 {%k1} = zmm11[0],mem[0],zmm11[2],mem[2],zmm11[4],mem[4],zmm11[6],mem[6]
21703 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
21704 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21705 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
21706 ; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm0, %zmm1
21707 ; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm0, %zmm7
21708 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7]
21709 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
21710 ; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm2
21711 ; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
21712 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
21713 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
21714 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm7[0],zmm28[0],zmm7[2],zmm28[2],zmm7[4],zmm28[4],zmm7[6],zmm28[6]
21715 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
21716 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21717 ; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm0, %zmm3
21718 ; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm0, %zmm4
21719 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm3[4,5,6,7]
21720 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
21721 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
21722 ; AVX512DQ-NEXT: vpermi2q %zmm25, %zmm15, %zmm0
21723 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm9[0],zmm10[0],zmm9[2],zmm10[2],zmm9[4],zmm10[4],zmm9[6],zmm10[6]
21724 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
21725 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21726 ; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm1
21727 ; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm0
21728 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [7,15,7,15,7,15,7,15]
21729 ; AVX512DQ-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
21730 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
21731 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm5, %zmm19
21732 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
21733 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
21734 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm19 {%k1} = zmm17[1],zmm4[1],zmm17[3],zmm4[3],zmm17[5],zmm4[5],zmm17[7],zmm4[7]
21735 ; AVX512DQ-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21736 ; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm6
21737 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [0,8,0,8,0,8,0,8]
21738 ; AVX512DQ-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
21739 ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm12, %zmm17
21740 ; AVX512DQ-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21741 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [1,9,1,9,1,9,1,9]
21742 ; AVX512DQ-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
21743 ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm2, %zmm6
21744 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21745 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm12, %zmm1
21746 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21747 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm2, %zmm0
21748 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21749 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
21750 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm20
21751 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm0
21752 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
21753 ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm5, %zmm1
21754 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
21755 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
21756 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm13[1],zmm11[1],zmm13[3],zmm11[3],zmm13[5],zmm11[5],zmm13[7],zmm11[7]
21757 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21758 ; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm1
21759 ; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm12, %zmm13
21760 ; AVX512DQ-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21761 ; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm2, %zmm1
21762 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21763 ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm12, %zmm20
21764 ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm2, %zmm0
21765 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21766 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
21767 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm17
21768 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm0
21769 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
21770 ; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm5, %zmm1
21771 ; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm10
21772 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm30[1],zmm26[1],zmm30[3],zmm26[3],zmm30[5],zmm26[5],zmm30[7],zmm26[7]
21773 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21774 ; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm1
21775 ; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm26
21776 ; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm12, %zmm26
21777 ; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm2, %zmm1
21778 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21779 ; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm12, %zmm17
21780 ; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm2, %zmm0
21781 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21782 ; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm19
21783 ; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm31
21784 ; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm3
21785 ; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm5, %zmm16
21786 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
21787 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm16 {%k1} = zmm24[1],zmm6[1],zmm24[3],zmm6[3],zmm24[5],zmm6[5],zmm24[7],zmm6[7]
21788 ; AVX512DQ-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21789 ; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm0
21790 ; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm29
21791 ; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm12, %zmm29
21792 ; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm2, %zmm0
21793 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21794 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm12, %zmm19
21795 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm2, %zmm31
21796 ; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload
21797 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm16
21798 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm27
21799 ; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm5, %zmm1
21800 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm23[1],zmm18[1],zmm23[3],zmm18[3],zmm23[5],zmm18[5],zmm23[7],zmm18[7]
21801 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
21802 ; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm1
21803 ; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm24
21804 ; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm12, %zmm24
21805 ; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm2, %zmm1
21806 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21807 ; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm12, %zmm16
21808 ; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm2, %zmm27
21809 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
21810 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm18
21811 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm30
21812 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
21813 ; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm5, %zmm1
21814 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
21815 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
21816 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm10[1],zmm0[1],zmm10[3],zmm0[3],zmm10[5],zmm0[5],zmm10[7],zmm0[7]
21817 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21818 ; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm1
21819 ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm12, %zmm10
21820 ; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21821 ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
21822 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21823 ; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm12, %zmm18
21824 ; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm2, %zmm30
21825 ; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm6
21826 ; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm23
21827 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
21828 ; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm5, %zmm6
21829 ; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm0
21830 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
21831 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm21[1],zmm28[1],zmm21[3],zmm28[3],zmm21[5],zmm28[5],zmm21[7],zmm28[7]
21832 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm28
21833 ; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm1
21834 ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm12, %zmm21
21835 ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
21836 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21837 ; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm12, %zmm23
21838 ; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm2, %zmm22
21839 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
21840 ; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm0
21841 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
21842 ; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm12, %zmm0
21843 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21844 ; AVX512DQ-NEXT: vpermi2q %zmm25, %zmm15, %zmm12
21845 ; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm3
21846 ; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm2, %zmm3
21847 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21848 ; AVX512DQ-NEXT: vpermi2q %zmm25, %zmm15, %zmm2
21849 ; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm5, %zmm15
21850 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm14[1],zmm8[1],zmm14[3],zmm8[3],zmm14[5],zmm8[5],zmm14[7],zmm8[7]
21851 ; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm25
21852 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
21853 ; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm7 # 64-byte Folded Reload
21854 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
21855 ; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm0 # 64-byte Folded Reload
21856 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
21857 ; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm8 # 64-byte Folded Reload
21858 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
21859 ; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm1 # 64-byte Folded Reload
21860 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
21861 ; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm13 # 64-byte Folded Reload
21862 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
21863 ; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm3 # 64-byte Folded Reload
21864 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
21865 ; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm9 # 64-byte Folded Reload
21866 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
21867 ; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm4 # 64-byte Folded Reload
21868 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
21869 ; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm11 # 64-byte Folded Reload
21870 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
21871 ; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm6 # 64-byte Folded Reload
21872 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
21873 ; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm15 # 64-byte Folded Reload
21874 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
21875 ; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload
21876 ; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21877 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
21878 ; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload
21879 ; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21880 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
21881 ; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload
21882 ; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21883 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
21884 ; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload
21885 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
21886 ; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm14 # 64-byte Folded Reload
21887 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7]
21888 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
21889 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0
21890 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21891 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm8[4,5,6,7]
21892 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
21893 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
21894 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21895 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm13[4,5,6,7]
21896 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
21897 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
21898 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21899 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm9[4,5,6,7]
21900 ; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload
21901 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
21902 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
21903 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm11[4,5,6,7]
21904 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
21905 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
21906 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21907 ; AVX512DQ-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload
21908 ; AVX512DQ-NEXT: # ymm0 = mem[0,1,2,3],ymm15[4,5,6,7]
21909 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm0
21910 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21911 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
21912 ; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
21913 ; AVX512DQ-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
21914 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm25
21915 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm10[4,5,6,7]
21916 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
21917 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
21918 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21919 ; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm17 {%k1}
21920 ; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm8
21921 ; AVX512DQ-NEXT: vinserti128 $1, 192(%rdi), %ymm8, %ymm8
21922 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
21923 ; AVX512DQ-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm0
21924 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm28 = ymm0[0],ymm8[0],ymm0[2],ymm8[2]
21925 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm28, %zmm17, %zmm26
21926 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
21927 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1}
21928 ; AVX512DQ-NEXT: vmovdqa 576(%rdi), %xmm13
21929 ; AVX512DQ-NEXT: vinserti128 $1, 704(%rdi), %ymm13, %ymm13
21930 ; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %xmm28
21931 ; AVX512DQ-NEXT: vinserti32x4 $1, 640(%rdi), %ymm28, %ymm28
21932 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm28[0],ymm13[0],ymm28[2],ymm13[2]
21933 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm1
21934 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21935 ; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm16 {%k1}
21936 ; AVX512DQ-NEXT: vmovdqa 1088(%rdi), %xmm4
21937 ; AVX512DQ-NEXT: vinserti128 $1, 1216(%rdi), %ymm4, %ymm4
21938 ; AVX512DQ-NEXT: vmovdqa 1024(%rdi), %xmm7
21939 ; AVX512DQ-NEXT: vinserti128 $1, 1152(%rdi), %ymm7, %ymm7
21940 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm7[0],ymm4[0],ymm7[2],ymm4[2]
21941 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm16, %zmm24
21942 ; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm19 {%k1}
21943 ; AVX512DQ-NEXT: vmovdqa 1600(%rdi), %xmm5
21944 ; AVX512DQ-NEXT: vinserti128 $1, 1728(%rdi), %ymm5, %ymm5
21945 ; AVX512DQ-NEXT: vmovdqa 1536(%rdi), %xmm11
21946 ; AVX512DQ-NEXT: vinserti128 $1, 1664(%rdi), %ymm11, %ymm11
21947 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm29 = ymm11[0],ymm5[0],ymm11[2],ymm5[2]
21948 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm29, %zmm19, %zmm19
21949 ; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm23 {%k1}
21950 ; AVX512DQ-NEXT: vmovdqa 2112(%rdi), %xmm6
21951 ; AVX512DQ-NEXT: vinserti128 $1, 2240(%rdi), %ymm6, %ymm6
21952 ; AVX512DQ-NEXT: vmovdqa64 2048(%rdi), %xmm29
21953 ; AVX512DQ-NEXT: vinserti32x4 $1, 2176(%rdi), %ymm29, %ymm29
21954 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm29[0],ymm6[0],ymm29[2],ymm6[2]
21955 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm23, %zmm3
21956 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
21957 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1}
21958 ; AVX512DQ-NEXT: vmovdqa 2624(%rdi), %xmm10
21959 ; AVX512DQ-NEXT: vinserti128 $1, 2752(%rdi), %ymm10, %ymm10
21960 ; AVX512DQ-NEXT: vmovdqa64 2560(%rdi), %xmm23
21961 ; AVX512DQ-NEXT: vinserti32x4 $1, 2688(%rdi), %ymm23, %ymm23
21962 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm20 = ymm23[0],ymm10[0],ymm23[2],ymm10[2]
21963 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm20, %zmm18, %zmm18
21964 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
21965 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
21966 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1}
21967 ; AVX512DQ-NEXT: vmovdqa64 3136(%rdi), %xmm17
21968 ; AVX512DQ-NEXT: vinserti32x4 $1, 3264(%rdi), %ymm17, %ymm17
21969 ; AVX512DQ-NEXT: vmovdqa64 3072(%rdi), %xmm20
21970 ; AVX512DQ-NEXT: vinserti32x4 $1, 3200(%rdi), %ymm20, %ymm20
21971 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm20[0],ymm17[0],ymm20[2],ymm17[2]
21972 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm15, %zmm1, %zmm15
21973 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
21974 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm12 {%k1}
21975 ; AVX512DQ-NEXT: vmovdqa 3648(%rdi), %xmm9
21976 ; AVX512DQ-NEXT: vinserti128 $1, 3776(%rdi), %ymm9, %ymm9
21977 ; AVX512DQ-NEXT: vmovdqa 3584(%rdi), %xmm14
21978 ; AVX512DQ-NEXT: vinserti128 $1, 3712(%rdi), %ymm14, %ymm14
21979 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm16 = ymm14[0],ymm9[0],ymm14[2],ymm9[2]
21980 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm16, %zmm12, %zmm1
21981 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
21982 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
21983 ; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm12 {%k1}
21984 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm16 = ymm20[1],ymm17[1],ymm20[3],ymm17[3]
21985 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm16, %zmm12, %zmm16
21986 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
21987 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
21988 ; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm12 {%k1}
21989 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm28[1],ymm13[1],ymm28[3],ymm13[3]
21990 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm13, %zmm12, %zmm13
21991 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
21992 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
21993 ; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm12 {%k1}
21994 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm8[1],ymm0[3],ymm8[3]
21995 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0
21996 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
21997 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm31 {%k1}
21998 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm11[1],ymm5[1],ymm11[3],ymm5[3]
21999 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm31, %zmm5
22000 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
22001 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm27 {%k1}
22002 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm7[1],ymm4[1],ymm7[3],ymm4[3]
22003 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4
22004 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
22005 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm30 {%k1}
22006 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm23[1],ymm10[1],ymm23[3],ymm10[3]
22007 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm7, %zmm30, %zmm7
22008 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
22009 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm22 {%k1}
22010 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm29[1],ymm6[1],ymm29[3],ymm6[3]
22011 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm6, %zmm22, %zmm6
22012 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
22013 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1}
22014 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm14[1],ymm9[1],ymm14[3],ymm9[3]
22015 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm2, %zmm2
22016 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, 448(%rsi)
22017 ; AVX512DQ-NEXT: vmovdqa64 %zmm15, 384(%rsi)
22018 ; AVX512DQ-NEXT: vmovdqa64 %zmm18, 320(%rsi)
22019 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, 256(%rsi)
22020 ; AVX512DQ-NEXT: vmovdqa64 %zmm19, 192(%rsi)
22021 ; AVX512DQ-NEXT: vmovdqa64 %zmm24, 128(%rsi)
22022 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
22023 ; AVX512DQ-NEXT: vmovaps %zmm1, 64(%rsi)
22024 ; AVX512DQ-NEXT: vmovdqa64 %zmm26, (%rsi)
22025 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, 448(%rdx)
22026 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, 256(%rdx)
22027 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, 320(%rdx)
22028 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, 128(%rdx)
22029 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, 192(%rdx)
22030 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx)
22031 ; AVX512DQ-NEXT: vmovdqa64 %zmm13, 64(%rdx)
22032 ; AVX512DQ-NEXT: vmovdqa64 %zmm16, 384(%rdx)
22033 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22034 ; AVX512DQ-NEXT: vmovaps %zmm0, 448(%rcx)
22035 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22036 ; AVX512DQ-NEXT: vmovaps %zmm0, 256(%rcx)
22037 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22038 ; AVX512DQ-NEXT: vmovaps %zmm0, 320(%rcx)
22039 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22040 ; AVX512DQ-NEXT: vmovaps %zmm0, 128(%rcx)
22041 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22042 ; AVX512DQ-NEXT: vmovaps %zmm0, 192(%rcx)
22043 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22044 ; AVX512DQ-NEXT: vmovaps %zmm0, (%rcx)
22045 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22046 ; AVX512DQ-NEXT: vmovaps %zmm0, 64(%rcx)
22047 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22048 ; AVX512DQ-NEXT: vmovaps %zmm0, 384(%rcx)
22049 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22050 ; AVX512DQ-NEXT: vmovaps %zmm0, 448(%r8)
22051 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22052 ; AVX512DQ-NEXT: vmovaps %zmm0, 256(%r8)
22053 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22054 ; AVX512DQ-NEXT: vmovaps %zmm0, 320(%r8)
22055 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22056 ; AVX512DQ-NEXT: vmovaps %zmm0, 128(%r8)
22057 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22058 ; AVX512DQ-NEXT: vmovaps %zmm0, 192(%r8)
22059 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22060 ; AVX512DQ-NEXT: vmovaps %zmm0, (%r8)
22061 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22062 ; AVX512DQ-NEXT: vmovaps %zmm0, 64(%r8)
22063 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22064 ; AVX512DQ-NEXT: vmovaps %zmm0, 384(%r8)
22065 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22066 ; AVX512DQ-NEXT: vmovaps %zmm0, 448(%r9)
22067 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22068 ; AVX512DQ-NEXT: vmovaps %zmm0, 256(%r9)
22069 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22070 ; AVX512DQ-NEXT: vmovaps %zmm0, 320(%r9)
22071 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22072 ; AVX512DQ-NEXT: vmovaps %zmm0, 128(%r9)
22073 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22074 ; AVX512DQ-NEXT: vmovaps %zmm0, 192(%r9)
22075 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22076 ; AVX512DQ-NEXT: vmovaps %zmm0, (%r9)
22077 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22078 ; AVX512DQ-NEXT: vmovaps %zmm0, 64(%r9)
22079 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22080 ; AVX512DQ-NEXT: vmovaps %zmm0, 384(%r9)
22081 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
22082 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22083 ; AVX512DQ-NEXT: vmovaps %zmm0, 448(%rax)
22084 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22085 ; AVX512DQ-NEXT: vmovaps %zmm0, 256(%rax)
22086 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22087 ; AVX512DQ-NEXT: vmovaps %zmm0, 320(%rax)
22088 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22089 ; AVX512DQ-NEXT: vmovaps %zmm0, 128(%rax)
22090 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22091 ; AVX512DQ-NEXT: vmovaps %zmm0, 192(%rax)
22092 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22093 ; AVX512DQ-NEXT: vmovaps %zmm0, (%rax)
22094 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22095 ; AVX512DQ-NEXT: vmovaps %zmm0, 64(%rax)
22096 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22097 ; AVX512DQ-NEXT: vmovaps %zmm0, 384(%rax)
22098 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
22099 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22100 ; AVX512DQ-NEXT: vmovaps %zmm0, 448(%rax)
22101 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22102 ; AVX512DQ-NEXT: vmovaps %zmm0, 256(%rax)
22103 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22104 ; AVX512DQ-NEXT: vmovaps %zmm0, 320(%rax)
22105 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22106 ; AVX512DQ-NEXT: vmovaps %zmm0, 128(%rax)
22107 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22108 ; AVX512DQ-NEXT: vmovaps %zmm0, 192(%rax)
22109 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22110 ; AVX512DQ-NEXT: vmovaps %zmm0, (%rax)
22111 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22112 ; AVX512DQ-NEXT: vmovaps %zmm0, 64(%rax)
22113 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22114 ; AVX512DQ-NEXT: vmovaps %zmm0, 384(%rax)
22115 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
22116 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22117 ; AVX512DQ-NEXT: vmovaps %zmm0, 384(%rax)
22118 ; AVX512DQ-NEXT: vmovdqa64 %zmm25, 448(%rax)
22119 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22120 ; AVX512DQ-NEXT: vmovaps %zmm0, 256(%rax)
22121 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22122 ; AVX512DQ-NEXT: vmovaps %zmm0, 320(%rax)
22123 ; AVX512DQ-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
22124 ; AVX512DQ-NEXT: vmovaps %zmm0, 128(%rax)
22125 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22126 ; AVX512DQ-NEXT: vmovaps %zmm0, 192(%rax)
22127 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22128 ; AVX512DQ-NEXT: vmovaps %zmm0, (%rax)
22129 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22130 ; AVX512DQ-NEXT: vmovaps %zmm0, 64(%rax)
22131 ; AVX512DQ-NEXT: addq $6664, %rsp # imm = 0x1A08
22132 ; AVX512DQ-NEXT: vzeroupper
22133 ; AVX512DQ-NEXT: retq
22135 ; AVX512DQ-FCP-LABEL: load_i64_stride8_vf64:
22136 ; AVX512DQ-FCP: # %bb.0:
22137 ; AVX512DQ-FCP-NEXT: subq $6664, %rsp # imm = 0x1A08
22138 ; AVX512DQ-FCP-NEXT: vmovdqa64 3392(%rdi), %zmm3
22139 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22140 ; AVX512DQ-FCP-NEXT: vmovdqa64 3328(%rdi), %zmm16
22141 ; AVX512DQ-FCP-NEXT: vmovdqa64 3520(%rdi), %zmm8
22142 ; AVX512DQ-FCP-NEXT: vmovdqa64 3456(%rdi), %zmm28
22143 ; AVX512DQ-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm10
22144 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22145 ; AVX512DQ-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm11
22146 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22147 ; AVX512DQ-FCP-NEXT: vmovdqa64 832(%rdi), %zmm4
22148 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22149 ; AVX512DQ-FCP-NEXT: vmovdqa64 768(%rdi), %zmm5
22150 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22151 ; AVX512DQ-FCP-NEXT: vmovdqa64 960(%rdi), %zmm6
22152 ; AVX512DQ-FCP-NEXT: vmovdqa64 896(%rdi), %zmm9
22153 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22154 ; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm12
22155 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22156 ; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm15
22157 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22158 ; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm13
22159 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22160 ; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm14
22161 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22162 ; AVX512DQ-FCP-NEXT: movb $-64, %al
22163 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1
22164 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
22165 ; AVX512DQ-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
22166 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm0
22167 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22168 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm0
22169 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22170 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm1
22171 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
22172 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
22173 ; AVX512DQ-FCP-NEXT: vmovdqa 3264(%rdi), %ymm3
22174 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
22175 ; AVX512DQ-FCP-NEXT: vmovdqa 3200(%rdi), %ymm0
22176 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
22177 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
22178 ; AVX512DQ-FCP-NEXT: vmovdqa 3136(%rdi), %ymm3
22179 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
22180 ; AVX512DQ-FCP-NEXT: vmovdqa 3072(%rdi), %ymm7
22181 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm7[0],ymm3[0],ymm7[2],ymm3[2]
22182 ; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
22183 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
22184 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22185 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm0
22186 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm0
22187 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm9
22188 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22189 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm1
22190 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm2, %zmm1
22191 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
22192 ; AVX512DQ-FCP-NEXT: vmovdqa 704(%rdi), %ymm0
22193 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
22194 ; AVX512DQ-FCP-NEXT: vmovdqa64 640(%rdi), %ymm20
22195 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm20[0],ymm0[0],ymm20[2],ymm0[2]
22196 ; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %ymm22
22197 ; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %ymm19
22198 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm22[0],ymm19[2],ymm22[2]
22199 ; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
22200 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
22201 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22202 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm0
22203 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm2, %zmm0
22204 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm1
22205 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm2, %zmm1
22206 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
22207 ; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm14
22208 ; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm15
22209 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm14[0],ymm15[2],ymm14[2]
22210 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %ymm21
22211 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm13
22212 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm13[0],ymm21[0],ymm13[2],ymm21[2]
22213 ; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
22214 ; AVX512DQ-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm3
22215 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22216 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
22217 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22218 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm0
22219 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm2, %zmm0
22220 ; AVX512DQ-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm1
22221 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22222 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm2, %zmm1
22223 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
22224 ; AVX512DQ-FCP-NEXT: vmovdqa 1728(%rdi), %ymm3
22225 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
22226 ; AVX512DQ-FCP-NEXT: vmovdqa 1664(%rdi), %ymm0
22227 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
22228 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
22229 ; AVX512DQ-FCP-NEXT: vmovdqa 1600(%rdi), %ymm4
22230 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
22231 ; AVX512DQ-FCP-NEXT: vmovdqa 1536(%rdi), %ymm3
22232 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
22233 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2]
22234 ; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
22235 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
22236 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22237 ; AVX512DQ-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm1
22238 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22239 ; AVX512DQ-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm0
22240 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22241 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
22242 ; AVX512DQ-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm3
22243 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22244 ; AVX512DQ-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm1
22245 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
22246 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
22247 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
22248 ; AVX512DQ-FCP-NEXT: vmovdqa 1216(%rdi), %ymm3
22249 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
22250 ; AVX512DQ-FCP-NEXT: vmovdqa 1152(%rdi), %ymm0
22251 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
22252 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
22253 ; AVX512DQ-FCP-NEXT: vmovdqa 1088(%rdi), %ymm4
22254 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
22255 ; AVX512DQ-FCP-NEXT: vmovdqa 1024(%rdi), %ymm3
22256 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
22257 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2]
22258 ; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
22259 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
22260 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22261 ; AVX512DQ-FCP-NEXT: vmovdqa64 3008(%rdi), %zmm1
22262 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22263 ; AVX512DQ-FCP-NEXT: vmovdqa64 2944(%rdi), %zmm29
22264 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm0
22265 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22266 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
22267 ; AVX512DQ-FCP-NEXT: vmovdqa64 2880(%rdi), %zmm24
22268 ; AVX512DQ-FCP-NEXT: vmovdqa64 2816(%rdi), %zmm25
22269 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm1
22270 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22271 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm2, %zmm1
22272 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22273 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
22274 ; AVX512DQ-FCP-NEXT: vmovdqa64 2752(%rdi), %ymm27
22275 ; AVX512DQ-FCP-NEXT: vmovdqa64 2688(%rdi), %ymm26
22276 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm26[0],ymm27[0],ymm26[2],ymm27[2]
22277 ; AVX512DQ-FCP-NEXT: vmovdqa64 2624(%rdi), %ymm30
22278 ; AVX512DQ-FCP-NEXT: vmovdqa64 2560(%rdi), %ymm18
22279 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm18[0],ymm30[0],ymm18[2],ymm30[2]
22280 ; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
22281 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
22282 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22283 ; AVX512DQ-FCP-NEXT: vmovdqa64 2496(%rdi), %zmm1
22284 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22285 ; AVX512DQ-FCP-NEXT: vmovdqa64 2432(%rdi), %zmm31
22286 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm0
22287 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22288 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
22289 ; AVX512DQ-FCP-NEXT: vmovdqa64 2368(%rdi), %zmm3
22290 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22291 ; AVX512DQ-FCP-NEXT: vmovdqa64 2304(%rdi), %zmm1
22292 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22293 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
22294 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
22295 ; AVX512DQ-FCP-NEXT: vmovdqa 2240(%rdi), %ymm12
22296 ; AVX512DQ-FCP-NEXT: vmovdqa 2176(%rdi), %ymm11
22297 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm12[0],ymm11[2],ymm12[2]
22298 ; AVX512DQ-FCP-NEXT: vmovdqa 2112(%rdi), %ymm10
22299 ; AVX512DQ-FCP-NEXT: vmovdqa 2048(%rdi), %ymm3
22300 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm3[0],ymm10[0],ymm3[2],ymm10[2]
22301 ; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm5[2,3],ymm0[2,3]
22302 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
22303 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22304 ; AVX512DQ-FCP-NEXT: vmovdqa64 4032(%rdi), %zmm1
22305 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22306 ; AVX512DQ-FCP-NEXT: vmovdqa64 3968(%rdi), %zmm0
22307 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22308 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
22309 ; AVX512DQ-FCP-NEXT: vmovdqa64 3904(%rdi), %zmm1
22310 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22311 ; AVX512DQ-FCP-NEXT: vmovdqa64 3840(%rdi), %zmm4
22312 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22313 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm4, %zmm2
22314 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
22315 ; AVX512DQ-FCP-NEXT: vmovdqa64 3776(%rdi), %ymm17
22316 ; AVX512DQ-FCP-NEXT: vmovdqa64 3712(%rdi), %ymm23
22317 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm23[0],ymm17[0],ymm23[2],ymm17[2]
22318 ; AVX512DQ-FCP-NEXT: vmovdqa 3648(%rdi), %ymm1
22319 ; AVX512DQ-FCP-NEXT: vmovdqa 3584(%rdi), %ymm0
22320 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
22321 ; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm6[2,3],ymm5[2,3]
22322 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2
22323 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22324 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11]
22325 ; AVX512DQ-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
22326 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm28
22327 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm8
22328 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22329 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm6
22330 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
22331 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm2, %zmm6
22332 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm6 {%k1}
22333 ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
22334 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload
22335 ; AVX512DQ-FCP-NEXT: # ymm5 = ymm4[1],mem[1],ymm4[3],mem[3]
22336 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
22337 ; AVX512DQ-FCP-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3]
22338 ; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3]
22339 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5
22340 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22341 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
22342 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm5
22343 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm5
22344 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
22345 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm6
22346 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
22347 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm2, %zmm6
22348 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1}
22349 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm20, %ymm5 # 32-byte Folded Reload
22350 ; AVX512DQ-FCP-NEXT: # ymm5 = ymm20[1],mem[1],ymm20[3],mem[3]
22351 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm19[1],ymm22[1],ymm19[3],ymm22[3]
22352 ; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3]
22353 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5
22354 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22355 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
22356 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm5
22357 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
22358 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm2, %zmm5
22359 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
22360 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
22361 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm2, %zmm6
22362 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1}
22363 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm15[1],ymm14[1],ymm15[3],ymm14[3]
22364 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm13[1],ymm21[1],ymm13[3],ymm21[3]
22365 ; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3]
22366 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5
22367 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22368 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
22369 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm5
22370 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
22371 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm2, %zmm5
22372 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
22373 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm6
22374 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
22375 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm2, %zmm6
22376 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1}
22377 ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
22378 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
22379 ; AVX512DQ-FCP-NEXT: # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3]
22380 ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
22381 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
22382 ; AVX512DQ-FCP-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3]
22383 ; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3]
22384 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5
22385 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22386 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
22387 ; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 64-byte Folded Reload
22388 ; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload
22389 ; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 64-byte Folded Reload
22390 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1}
22391 ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
22392 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
22393 ; AVX512DQ-FCP-NEXT: # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3]
22394 ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
22395 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
22396 ; AVX512DQ-FCP-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3]
22397 ; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3]
22398 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5
22399 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22400 ; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm29 # 64-byte Folded Reload
22401 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm2, %zmm25
22402 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm25 {%k1}
22403 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm26[1],ymm27[1],ymm26[3],ymm27[3]
22404 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm18[1],ymm30[1],ymm18[3],ymm30[3]
22405 ; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3]
22406 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm25, %zmm5
22407 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22408 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
22409 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm2, %zmm31
22410 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
22411 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm6
22412 ; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 64-byte Folded Reload
22413 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm6 {%k1}
22414 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm11[1],ymm12[1],ymm11[3],ymm12[3]
22415 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],ymm10[1],ymm3[3],ymm10[3]
22416 ; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm5[2,3]
22417 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3
22418 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22419 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
22420 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm3
22421 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
22422 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm2, %zmm3
22423 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
22424 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
22425 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm11, %zmm12, %zmm2
22426 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1}
22427 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm23[1],ymm17[1],ymm23[3],ymm17[3]
22428 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
22429 ; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
22430 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
22431 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22432 ; AVX512DQ-FCP-NEXT: vmovdqa64 3264(%rdi), %zmm3
22433 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22434 ; AVX512DQ-FCP-NEXT: vmovdqa64 3200(%rdi), %zmm10
22435 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12]
22436 ; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
22437 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm1
22438 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm1
22439 ; AVX512DQ-FCP-NEXT: vmovdqa64 3136(%rdi), %zmm23
22440 ; AVX512DQ-FCP-NEXT: vmovdqa64 3072(%rdi), %zmm2
22441 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22442 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm0, %zmm2
22443 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22444 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
22445 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
22446 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm2
22447 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
22448 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm2
22449 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm8[0],zmm16[0],zmm8[2],zmm16[2],zmm8[4],zmm16[4],zmm8[6],zmm16[6]
22450 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
22451 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
22452 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22453 ; AVX512DQ-FCP-NEXT: vmovdqa64 704(%rdi), %zmm2
22454 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22455 ; AVX512DQ-FCP-NEXT: vmovdqa64 640(%rdi), %zmm1
22456 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22457 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
22458 ; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm3
22459 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22460 ; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm2
22461 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22462 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2
22463 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
22464 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm2
22465 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
22466 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm0, %zmm2
22467 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm9[0],zmm4[0],zmm9[2],zmm4[2],zmm9[4],zmm4[4],zmm9[6],zmm4[6]
22468 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
22469 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
22470 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22471 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm2
22472 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22473 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1
22474 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22475 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
22476 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3
22477 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22478 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm2
22479 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22480 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2
22481 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
22482 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm2
22483 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm2
22484 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm16
22485 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
22486 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm3[0],zmm22[0],zmm3[2],zmm22[2],zmm3[4],zmm22[4],zmm3[6],zmm22[6]
22487 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
22488 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
22489 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22490 ; AVX512DQ-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm2
22491 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22492 ; AVX512DQ-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm1
22493 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22494 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
22495 ; AVX512DQ-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm3
22496 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22497 ; AVX512DQ-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm2
22498 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22499 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2
22500 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
22501 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm2
22502 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm2
22503 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm13[0],zmm21[2],zmm13[2],zmm21[4],zmm13[4],zmm21[6],zmm13[6]
22504 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
22505 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
22506 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22507 ; AVX512DQ-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm2
22508 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22509 ; AVX512DQ-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm1
22510 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22511 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
22512 ; AVX512DQ-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm3
22513 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22514 ; AVX512DQ-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm2
22515 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22516 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2
22517 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
22518 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
22519 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm2
22520 ; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
22521 ; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm27 # 64-byte Reload
22522 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm3 # 64-byte Folded Reload
22523 ; AVX512DQ-FCP-NEXT: # zmm3 = zmm27[0],mem[0],zmm27[2],mem[2],zmm27[4],mem[4],zmm27[6],mem[6]
22524 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
22525 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
22526 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22527 ; AVX512DQ-FCP-NEXT: vmovdqa64 2752(%rdi), %zmm3
22528 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22529 ; AVX512DQ-FCP-NEXT: vmovdqa64 2688(%rdi), %zmm8
22530 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm1
22531 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22532 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm1
22533 ; AVX512DQ-FCP-NEXT: vmovdqa64 2624(%rdi), %zmm3
22534 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22535 ; AVX512DQ-FCP-NEXT: vmovdqa64 2560(%rdi), %zmm2
22536 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22537 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2
22538 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
22539 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
22540 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm2
22541 ; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
22542 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
22543 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm3 # 64-byte Folded Reload
22544 ; AVX512DQ-FCP-NEXT: # zmm3 = zmm7[0],mem[0],zmm7[2],mem[2],zmm7[4],mem[4],zmm7[6],mem[6]
22545 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
22546 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
22547 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22548 ; AVX512DQ-FCP-NEXT: vmovdqa64 2240(%rdi), %zmm2
22549 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22550 ; AVX512DQ-FCP-NEXT: vmovdqa64 2176(%rdi), %zmm1
22551 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22552 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
22553 ; AVX512DQ-FCP-NEXT: vmovdqa64 2112(%rdi), %zmm3
22554 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22555 ; AVX512DQ-FCP-NEXT: vmovdqa64 2048(%rdi), %zmm2
22556 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22557 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2
22558 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
22559 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
22560 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm2
22561 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm2
22562 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
22563 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm26[0],zmm4[0],zmm26[2],zmm4[2],zmm26[4],zmm4[4],zmm26[6],zmm4[6]
22564 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
22565 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
22566 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22567 ; AVX512DQ-FCP-NEXT: vmovdqa64 3776(%rdi), %zmm2
22568 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22569 ; AVX512DQ-FCP-NEXT: vmovdqa64 3712(%rdi), %zmm1
22570 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22571 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
22572 ; AVX512DQ-FCP-NEXT: vmovdqa64 3648(%rdi), %zmm3
22573 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22574 ; AVX512DQ-FCP-NEXT: vmovdqa64 3584(%rdi), %zmm2
22575 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22576 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2
22577 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
22578 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm18, %zmm29, %zmm0
22579 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm12[0],zmm11[0],zmm12[2],zmm11[2],zmm12[4],zmm11[4],zmm12[6],zmm11[6]
22580 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
22581 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0
22582 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22583 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,5,13,5,13,5,13]
22584 ; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
22585 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22586 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
22587 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm10
22588 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
22589 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm0, %zmm2
22590 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm10[4,5,6,7]
22591 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm2
22592 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm2
22593 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
22594 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
22595 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm19[1],zmm25[1],zmm19[3],zmm25[3],zmm19[5],zmm25[5],zmm19[7],zmm25[7]
22596 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
22597 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
22598 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22599 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
22600 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
22601 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm1
22602 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
22603 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
22604 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm2
22605 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
22606 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm29
22607 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm2
22608 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm0, %zmm2
22609 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
22610 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
22611 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm15[1],zmm28[1],zmm15[3],zmm28[3],zmm15[5],zmm28[5],zmm15[7],zmm28[7]
22612 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
22613 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
22614 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22615 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
22616 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
22617 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm1
22618 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
22619 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
22620 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm0, %zmm2
22621 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
22622 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
22623 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm2
22624 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
22625 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
22626 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm12[1],zmm18[1],zmm12[3],zmm18[3],zmm12[5],zmm18[5],zmm12[7],zmm18[7]
22627 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
22628 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
22629 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22630 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
22631 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
22632 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm0, %zmm1
22633 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
22634 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
22635 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm2
22636 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
22637 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm2
22638 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
22639 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm0, %zmm2
22640 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
22641 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm3[1],zmm13[1],zmm3[3],zmm13[3],zmm3[5],zmm13[5],zmm3[7],zmm13[7]
22642 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
22643 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
22644 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22645 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
22646 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
22647 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm1
22648 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
22649 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
22650 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm0, %zmm2
22651 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
22652 ; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm31 # 64-byte Folded Reload
22653 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm3 # 64-byte Folded Reload
22654 ; AVX512DQ-FCP-NEXT: # zmm3 = zmm27[1],mem[1],zmm27[3],mem[3],zmm27[5],mem[5],zmm27[7],mem[7]
22655 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm3 {%k1}
22656 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
22657 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22658 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
22659 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm8
22660 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
22661 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
22662 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm2
22663 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm8[4,5,6,7]
22664 ; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 64-byte Folded Reload
22665 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm3 # 64-byte Folded Reload
22666 ; AVX512DQ-FCP-NEXT: # zmm3 = zmm7[1],mem[1],zmm7[3],mem[3],zmm7[5],mem[5],zmm7[7],mem[7]
22667 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1}
22668 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
22669 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22670 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
22671 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
22672 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm0, %zmm1
22673 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
22674 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm2
22675 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
22676 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm2
22677 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
22678 ; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload
22679 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
22680 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm3[1],zmm4[1],zmm3[3],zmm4[3],zmm3[5],zmm4[5],zmm3[7],zmm4[7]
22681 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1}
22682 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
22683 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22684 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
22685 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm1
22686 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
22687 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm1
22688 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
22689 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm2
22690 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
22691 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm2
22692 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
22693 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
22694 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
22695 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm10, %zmm9, %zmm0
22696 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
22697 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload
22698 ; AVX512DQ-FCP-NEXT: # zmm2 = zmm2[1],mem[1],zmm2[3],mem[3],zmm2[5],mem[5],zmm2[7],mem[7]
22699 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
22700 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0
22701 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22702 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14]
22703 ; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
22704 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
22705 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm1
22706 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
22707 ; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
22708 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
22709 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm2
22710 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm2
22711 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
22712 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm2 {%k1} # 64-byte Folded Reload
22713 ; AVX512DQ-FCP-NEXT: # zmm2 {%k1} = zmm25[0],mem[0],zmm25[2],mem[2],zmm25[4],mem[4],zmm25[6],mem[6]
22714 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
22715 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22716 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
22717 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm1
22718 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
22719 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm2
22720 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
22721 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm0, %zmm15
22722 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm15 {%k1} = zmm29[0],zmm30[0],zmm29[2],zmm30[2],zmm29[4],zmm30[4],zmm29[6],zmm30[6]
22723 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm1
22724 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22725 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
22726 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm1
22727 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
22728 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm0, %zmm2
22729 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
22730 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm2
22731 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm2
22732 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
22733 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
22734 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm30[0],zmm26[0],zmm30[2],zmm26[2],zmm30[4],zmm26[4],zmm30[6],zmm26[6]
22735 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
22736 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22737 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
22738 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm0, %zmm1
22739 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
22740 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm2
22741 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
22742 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
22743 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm2
22744 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
22745 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm0, %zmm2
22746 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
22747 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm24[0],zmm23[0],zmm24[2],zmm23[2],zmm24[4],zmm23[4],zmm24[6],zmm23[6]
22748 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
22749 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22750 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
22751 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm1
22752 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
22753 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm0, %zmm2
22754 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
22755 ; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload
22756 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
22757 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm0, %zmm2
22758 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
22759 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
22760 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6]
22761 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
22762 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22763 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
22764 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm1
22765 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
22766 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm2
22767 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
22768 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
22769 ; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
22770 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
22771 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm2 {%k1} # 64-byte Folded Reload
22772 ; AVX512DQ-FCP-NEXT: # zmm2 {%k1} = zmm11[0],mem[0],zmm11[2],mem[2],zmm11[4],mem[4],zmm11[6],mem[6]
22773 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
22774 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22775 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
22776 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm0, %zmm1
22777 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm7
22778 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7]
22779 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
22780 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm2
22781 ; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
22782 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
22783 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
22784 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm7[0],zmm28[0],zmm7[2],zmm28[2],zmm7[4],zmm28[4],zmm7[6],zmm28[6]
22785 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
22786 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22787 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm3
22788 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm4
22789 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm3[4,5,6,7]
22790 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
22791 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
22792 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm25, %zmm15, %zmm0
22793 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm9[0],zmm10[0],zmm9[2],zmm10[2],zmm9[4],zmm10[4],zmm9[6],zmm10[6]
22794 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
22795 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22796 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm1
22797 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm0
22798 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [7,15,7,15,7,15,7,15]
22799 ; AVX512DQ-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
22800 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
22801 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm19
22802 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
22803 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
22804 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm19 {%k1} = zmm17[1],zmm4[1],zmm17[3],zmm4[3],zmm17[5],zmm4[5],zmm17[7],zmm4[7]
22805 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22806 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm6
22807 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [0,8,0,8,0,8,0,8]
22808 ; AVX512DQ-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
22809 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm12, %zmm17
22810 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22811 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [1,9,1,9,1,9,1,9]
22812 ; AVX512DQ-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
22813 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm2, %zmm6
22814 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22815 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm12, %zmm1
22816 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22817 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm0
22818 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22819 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
22820 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm20
22821 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0
22822 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
22823 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm5, %zmm1
22824 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
22825 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
22826 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm13[1],zmm11[1],zmm13[3],zmm11[3],zmm13[5],zmm11[5],zmm13[7],zmm11[7]
22827 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22828 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm1
22829 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm12, %zmm13
22830 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22831 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm2, %zmm1
22832 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22833 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm12, %zmm20
22834 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm2, %zmm0
22835 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22836 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
22837 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm17
22838 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0
22839 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
22840 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm5, %zmm1
22841 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm10
22842 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm30[1],zmm26[1],zmm30[3],zmm26[3],zmm30[5],zmm26[5],zmm30[7],zmm26[7]
22843 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22844 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm1
22845 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm26
22846 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm12, %zmm26
22847 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm2, %zmm1
22848 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22849 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm12, %zmm17
22850 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm0
22851 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22852 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm19
22853 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm31
22854 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm3
22855 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm5, %zmm16
22856 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
22857 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm16 {%k1} = zmm24[1],zmm6[1],zmm24[3],zmm6[3],zmm24[5],zmm6[5],zmm24[7],zmm6[7]
22858 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22859 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm0
22860 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm29
22861 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm12, %zmm29
22862 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm0
22863 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22864 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm12, %zmm19
22865 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm31
22866 ; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload
22867 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm16
22868 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm27
22869 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm5, %zmm1
22870 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm23[1],zmm18[1],zmm23[3],zmm18[3],zmm23[5],zmm18[5],zmm23[7],zmm18[7]
22871 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
22872 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm1
22873 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm24
22874 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm12, %zmm24
22875 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm2, %zmm1
22876 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22877 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm12, %zmm16
22878 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm2, %zmm27
22879 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
22880 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm18
22881 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm30
22882 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
22883 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm5, %zmm1
22884 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22885 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
22886 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm10[1],zmm0[1],zmm10[3],zmm0[3],zmm10[5],zmm0[5],zmm10[7],zmm0[7]
22887 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22888 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm1
22889 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm12, %zmm10
22890 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22891 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
22892 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22893 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm12, %zmm18
22894 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm30
22895 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm6
22896 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm23
22897 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
22898 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm5, %zmm6
22899 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm0
22900 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
22901 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm21[1],zmm28[1],zmm21[3],zmm28[3],zmm21[5],zmm28[5],zmm21[7],zmm28[7]
22902 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm28
22903 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm1
22904 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm12, %zmm21
22905 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
22906 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22907 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm12, %zmm23
22908 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm22
22909 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
22910 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm0
22911 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
22912 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm12, %zmm0
22913 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22914 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm25, %zmm15, %zmm12
22915 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm3
22916 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm3
22917 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22918 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm25, %zmm15, %zmm2
22919 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm5, %zmm15
22920 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm14[1],zmm8[1],zmm14[3],zmm8[3],zmm14[5],zmm8[5],zmm14[7],zmm8[7]
22921 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm25
22922 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
22923 ; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm7 # 64-byte Folded Reload
22924 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22925 ; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm0 # 64-byte Folded Reload
22926 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
22927 ; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm8 # 64-byte Folded Reload
22928 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
22929 ; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm1 # 64-byte Folded Reload
22930 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
22931 ; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm13 # 64-byte Folded Reload
22932 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
22933 ; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm3 # 64-byte Folded Reload
22934 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
22935 ; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm9 # 64-byte Folded Reload
22936 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
22937 ; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm4 # 64-byte Folded Reload
22938 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
22939 ; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm11 # 64-byte Folded Reload
22940 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
22941 ; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm6 # 64-byte Folded Reload
22942 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
22943 ; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm15 # 64-byte Folded Reload
22944 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
22945 ; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload
22946 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22947 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
22948 ; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload
22949 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22950 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
22951 ; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload
22952 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22953 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
22954 ; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload
22955 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
22956 ; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm14 # 64-byte Folded Reload
22957 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7]
22958 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
22959 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0
22960 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22961 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm8[4,5,6,7]
22962 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
22963 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
22964 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22965 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm13[4,5,6,7]
22966 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
22967 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
22968 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22969 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm9[4,5,6,7]
22970 ; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload
22971 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
22972 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
22973 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm11[4,5,6,7]
22974 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
22975 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
22976 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22977 ; AVX512DQ-FCP-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload
22978 ; AVX512DQ-FCP-NEXT: # ymm0 = mem[0,1,2,3],ymm15[4,5,6,7]
22979 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm0
22980 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22981 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22982 ; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
22983 ; AVX512DQ-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
22984 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm25
22985 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm10[4,5,6,7]
22986 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
22987 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
22988 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22989 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm17 {%k1}
22990 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm8
22991 ; AVX512DQ-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm8, %ymm8
22992 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
22993 ; AVX512DQ-FCP-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm0
22994 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm28 = ymm0[0],ymm8[0],ymm0[2],ymm8[2]
22995 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm28, %zmm17, %zmm26
22996 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
22997 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1}
22998 ; AVX512DQ-FCP-NEXT: vmovdqa 576(%rdi), %xmm13
22999 ; AVX512DQ-FCP-NEXT: vinserti128 $1, 704(%rdi), %ymm13, %ymm13
23000 ; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %xmm28
23001 ; AVX512DQ-FCP-NEXT: vinserti32x4 $1, 640(%rdi), %ymm28, %ymm28
23002 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm28[0],ymm13[0],ymm28[2],ymm13[2]
23003 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm1
23004 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23005 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm16 {%k1}
23006 ; AVX512DQ-FCP-NEXT: vmovdqa 1088(%rdi), %xmm4
23007 ; AVX512DQ-FCP-NEXT: vinserti128 $1, 1216(%rdi), %ymm4, %ymm4
23008 ; AVX512DQ-FCP-NEXT: vmovdqa 1024(%rdi), %xmm7
23009 ; AVX512DQ-FCP-NEXT: vinserti128 $1, 1152(%rdi), %ymm7, %ymm7
23010 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm7[0],ymm4[0],ymm7[2],ymm4[2]
23011 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm16, %zmm24
23012 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm19 {%k1}
23013 ; AVX512DQ-FCP-NEXT: vmovdqa 1600(%rdi), %xmm5
23014 ; AVX512DQ-FCP-NEXT: vinserti128 $1, 1728(%rdi), %ymm5, %ymm5
23015 ; AVX512DQ-FCP-NEXT: vmovdqa 1536(%rdi), %xmm11
23016 ; AVX512DQ-FCP-NEXT: vinserti128 $1, 1664(%rdi), %ymm11, %ymm11
23017 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm29 = ymm11[0],ymm5[0],ymm11[2],ymm5[2]
23018 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm29, %zmm19, %zmm19
23019 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm23 {%k1}
23020 ; AVX512DQ-FCP-NEXT: vmovdqa 2112(%rdi), %xmm6
23021 ; AVX512DQ-FCP-NEXT: vinserti128 $1, 2240(%rdi), %ymm6, %ymm6
23022 ; AVX512DQ-FCP-NEXT: vmovdqa64 2048(%rdi), %xmm29
23023 ; AVX512DQ-FCP-NEXT: vinserti32x4 $1, 2176(%rdi), %ymm29, %ymm29
23024 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm29[0],ymm6[0],ymm29[2],ymm6[2]
23025 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm23, %zmm3
23026 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
23027 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1}
23028 ; AVX512DQ-FCP-NEXT: vmovdqa 2624(%rdi), %xmm10
23029 ; AVX512DQ-FCP-NEXT: vinserti128 $1, 2752(%rdi), %ymm10, %ymm10
23030 ; AVX512DQ-FCP-NEXT: vmovdqa64 2560(%rdi), %xmm23
23031 ; AVX512DQ-FCP-NEXT: vinserti32x4 $1, 2688(%rdi), %ymm23, %ymm23
23032 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm20 = ymm23[0],ymm10[0],ymm23[2],ymm10[2]
23033 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm20, %zmm18, %zmm18
23034 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
23035 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
23036 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1}
23037 ; AVX512DQ-FCP-NEXT: vmovdqa64 3136(%rdi), %xmm17
23038 ; AVX512DQ-FCP-NEXT: vinserti32x4 $1, 3264(%rdi), %ymm17, %ymm17
23039 ; AVX512DQ-FCP-NEXT: vmovdqa64 3072(%rdi), %xmm20
23040 ; AVX512DQ-FCP-NEXT: vinserti32x4 $1, 3200(%rdi), %ymm20, %ymm20
23041 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm20[0],ymm17[0],ymm20[2],ymm17[2]
23042 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm15, %zmm1, %zmm15
23043 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
23044 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm12 {%k1}
23045 ; AVX512DQ-FCP-NEXT: vmovdqa 3648(%rdi), %xmm9
23046 ; AVX512DQ-FCP-NEXT: vinserti128 $1, 3776(%rdi), %ymm9, %ymm9
23047 ; AVX512DQ-FCP-NEXT: vmovdqa 3584(%rdi), %xmm14
23048 ; AVX512DQ-FCP-NEXT: vinserti128 $1, 3712(%rdi), %ymm14, %ymm14
23049 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm16 = ymm14[0],ymm9[0],ymm14[2],ymm9[2]
23050 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm16, %zmm12, %zmm1
23051 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
23052 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
23053 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm12 {%k1}
23054 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm16 = ymm20[1],ymm17[1],ymm20[3],ymm17[3]
23055 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm16, %zmm12, %zmm16
23056 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
23057 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
23058 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm12 {%k1}
23059 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm28[1],ymm13[1],ymm28[3],ymm13[3]
23060 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm12, %zmm13
23061 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
23062 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
23063 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm12 {%k1}
23064 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm8[1],ymm0[3],ymm8[3]
23065 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0
23066 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
23067 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm31 {%k1}
23068 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm11[1],ymm5[1],ymm11[3],ymm5[3]
23069 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm31, %zmm5
23070 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
23071 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm27 {%k1}
23072 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm7[1],ymm4[1],ymm7[3],ymm4[3]
23073 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4
23074 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
23075 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm30 {%k1}
23076 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm23[1],ymm10[1],ymm23[3],ymm10[3]
23077 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm30, %zmm7
23078 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
23079 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm22 {%k1}
23080 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm29[1],ymm6[1],ymm29[3],ymm6[3]
23081 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm22, %zmm6
23082 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
23083 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1}
23084 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm14[1],ymm9[1],ymm14[3],ymm9[3]
23085 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm2, %zmm2
23086 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 448(%rsi)
23087 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, 384(%rsi)
23088 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, 320(%rsi)
23089 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 256(%rsi)
23090 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, 192(%rsi)
23091 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, 128(%rsi)
23092 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
23093 ; AVX512DQ-FCP-NEXT: vmovaps %zmm1, 64(%rsi)
23094 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, (%rsi)
23095 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 448(%rdx)
23096 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 256(%rdx)
23097 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 320(%rdx)
23098 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 128(%rdx)
23099 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 192(%rdx)
23100 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%rdx)
23101 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 64(%rdx)
23102 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, 384(%rdx)
23103 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23104 ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 448(%rcx)
23105 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23106 ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 256(%rcx)
23107 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23108 ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 320(%rcx)
23109 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23110 ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 128(%rcx)
23111 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23112 ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 192(%rcx)
23113 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23114 ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, (%rcx)
23115 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23116 ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 64(%rcx)
23117 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23118 ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 384(%rcx)
23119 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23120 ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 448(%r8)
23121 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23122 ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 256(%r8)
23123 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23124 ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 320(%r8)
23125 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23126 ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 128(%r8)
23127 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23128 ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 192(%r8)
23129 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23130 ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, (%r8)
23131 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23132 ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 64(%r8)
23133 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23134 ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 384(%r8)
23135 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23136 ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 448(%r9)
23137 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23138 ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 256(%r9)
23139 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23140 ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 320(%r9)
23141 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23142 ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 128(%r9)
23143 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23144 ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 192(%r9)
23145 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23146 ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, (%r9)
23147 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23148 ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 64(%r9)
23149 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23150 ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 384(%r9)
23151 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
23152 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23153 ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 448(%rax)
23154 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23155 ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 256(%rax)
23156 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23157 ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 320(%rax)
23158 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23159 ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 128(%rax)
23160 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23161 ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 192(%rax)
23162 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23163 ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, (%rax)
23164 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23165 ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 64(%rax)
23166 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23167 ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 384(%rax)
23168 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
23169 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23170 ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 448(%rax)
23171 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23172 ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 256(%rax)
23173 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23174 ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 320(%rax)
23175 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23176 ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 128(%rax)
23177 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23178 ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 192(%rax)
23179 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23180 ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, (%rax)
23181 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23182 ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 64(%rax)
23183 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23184 ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 384(%rax)
23185 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
23186 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23187 ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 384(%rax)
23188 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, 448(%rax)
23189 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23190 ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 256(%rax)
23191 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23192 ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 320(%rax)
23193 ; AVX512DQ-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
23194 ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 128(%rax)
23195 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23196 ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 192(%rax)
23197 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23198 ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, (%rax)
23199 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23200 ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 64(%rax)
23201 ; AVX512DQ-FCP-NEXT: addq $6664, %rsp # imm = 0x1A08
23202 ; AVX512DQ-FCP-NEXT: vzeroupper
23203 ; AVX512DQ-FCP-NEXT: retq
23205 ; AVX512BW-LABEL: load_i64_stride8_vf64:
23206 ; AVX512BW: # %bb.0:
23207 ; AVX512BW-NEXT: subq $6664, %rsp # imm = 0x1A08
23208 ; AVX512BW-NEXT: vmovdqa64 3392(%rdi), %zmm3
23209 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23210 ; AVX512BW-NEXT: vmovdqa64 3328(%rdi), %zmm16
23211 ; AVX512BW-NEXT: vmovdqa64 3520(%rdi), %zmm8
23212 ; AVX512BW-NEXT: vmovdqa64 3456(%rdi), %zmm28
23213 ; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm10
23214 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23215 ; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm11
23216 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23217 ; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm4
23218 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23219 ; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm5
23220 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23221 ; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm6
23222 ; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm9
23223 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23224 ; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm12
23225 ; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23226 ; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm15
23227 ; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23228 ; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm13
23229 ; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23230 ; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm14
23231 ; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23232 ; AVX512BW-NEXT: movb $-64, %al
23233 ; AVX512BW-NEXT: kmovd %eax, %k1
23234 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
23235 ; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
23236 ; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm0
23237 ; AVX512BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23238 ; AVX512BW-NEXT: vpermt2q %zmm8, %zmm2, %zmm0
23239 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23240 ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm1
23241 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
23242 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
23243 ; AVX512BW-NEXT: vmovdqa 3264(%rdi), %ymm3
23244 ; AVX512BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
23245 ; AVX512BW-NEXT: vmovdqa 3200(%rdi), %ymm0
23246 ; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
23247 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
23248 ; AVX512BW-NEXT: vmovdqa 3136(%rdi), %ymm3
23249 ; AVX512BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
23250 ; AVX512BW-NEXT: vmovdqa 3072(%rdi), %ymm7
23251 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm7[0],ymm3[0],ymm7[2],ymm3[2]
23252 ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
23253 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
23254 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23255 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0
23256 ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm2, %zmm0
23257 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm9
23258 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23259 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1
23260 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm1
23261 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
23262 ; AVX512BW-NEXT: vmovdqa 704(%rdi), %ymm0
23263 ; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
23264 ; AVX512BW-NEXT: vmovdqa64 640(%rdi), %ymm20
23265 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm20[0],ymm0[0],ymm20[2],ymm0[2]
23266 ; AVX512BW-NEXT: vmovdqa64 576(%rdi), %ymm22
23267 ; AVX512BW-NEXT: vmovdqa64 512(%rdi), %ymm19
23268 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm22[0],ymm19[2],ymm22[2]
23269 ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
23270 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
23271 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23272 ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm0
23273 ; AVX512BW-NEXT: vpermt2q %zmm13, %zmm2, %zmm0
23274 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm1
23275 ; AVX512BW-NEXT: vpermt2q %zmm12, %zmm2, %zmm1
23276 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
23277 ; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm14
23278 ; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm15
23279 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm14[0],ymm15[2],ymm14[2]
23280 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm21
23281 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm13
23282 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm13[0],ymm21[0],ymm13[2],ymm21[2]
23283 ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
23284 ; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm3
23285 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23286 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
23287 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23288 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0
23289 ; AVX512BW-NEXT: vpermt2q %zmm11, %zmm2, %zmm0
23290 ; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm1
23291 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23292 ; AVX512BW-NEXT: vpermt2q %zmm10, %zmm2, %zmm1
23293 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
23294 ; AVX512BW-NEXT: vmovdqa 1728(%rdi), %ymm3
23295 ; AVX512BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
23296 ; AVX512BW-NEXT: vmovdqa 1664(%rdi), %ymm0
23297 ; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
23298 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
23299 ; AVX512BW-NEXT: vmovdqa 1600(%rdi), %ymm4
23300 ; AVX512BW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
23301 ; AVX512BW-NEXT: vmovdqa 1536(%rdi), %ymm3
23302 ; AVX512BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
23303 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2]
23304 ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
23305 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
23306 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23307 ; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm1
23308 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23309 ; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm0
23310 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23311 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
23312 ; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm3
23313 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23314 ; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm1
23315 ; AVX512BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
23316 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
23317 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
23318 ; AVX512BW-NEXT: vmovdqa 1216(%rdi), %ymm3
23319 ; AVX512BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
23320 ; AVX512BW-NEXT: vmovdqa 1152(%rdi), %ymm0
23321 ; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
23322 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
23323 ; AVX512BW-NEXT: vmovdqa 1088(%rdi), %ymm4
23324 ; AVX512BW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
23325 ; AVX512BW-NEXT: vmovdqa 1024(%rdi), %ymm3
23326 ; AVX512BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
23327 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2]
23328 ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
23329 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
23330 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23331 ; AVX512BW-NEXT: vmovdqa64 3008(%rdi), %zmm1
23332 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23333 ; AVX512BW-NEXT: vmovdqa64 2944(%rdi), %zmm29
23334 ; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm0
23335 ; AVX512BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23336 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
23337 ; AVX512BW-NEXT: vmovdqa64 2880(%rdi), %zmm24
23338 ; AVX512BW-NEXT: vmovdqa64 2816(%rdi), %zmm25
23339 ; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm1
23340 ; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23341 ; AVX512BW-NEXT: vpermt2q %zmm24, %zmm2, %zmm1
23342 ; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23343 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
23344 ; AVX512BW-NEXT: vmovdqa64 2752(%rdi), %ymm27
23345 ; AVX512BW-NEXT: vmovdqa64 2688(%rdi), %ymm26
23346 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm26[0],ymm27[0],ymm26[2],ymm27[2]
23347 ; AVX512BW-NEXT: vmovdqa64 2624(%rdi), %ymm30
23348 ; AVX512BW-NEXT: vmovdqa64 2560(%rdi), %ymm18
23349 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm18[0],ymm30[0],ymm18[2],ymm30[2]
23350 ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
23351 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
23352 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23353 ; AVX512BW-NEXT: vmovdqa64 2496(%rdi), %zmm1
23354 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23355 ; AVX512BW-NEXT: vmovdqa64 2432(%rdi), %zmm31
23356 ; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm0
23357 ; AVX512BW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23358 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
23359 ; AVX512BW-NEXT: vmovdqa64 2368(%rdi), %zmm3
23360 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23361 ; AVX512BW-NEXT: vmovdqa64 2304(%rdi), %zmm1
23362 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23363 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
23364 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
23365 ; AVX512BW-NEXT: vmovdqa 2240(%rdi), %ymm12
23366 ; AVX512BW-NEXT: vmovdqa 2176(%rdi), %ymm11
23367 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm12[0],ymm11[2],ymm12[2]
23368 ; AVX512BW-NEXT: vmovdqa 2112(%rdi), %ymm10
23369 ; AVX512BW-NEXT: vmovdqa 2048(%rdi), %ymm3
23370 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm3[0],ymm10[0],ymm3[2],ymm10[2]
23371 ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm5[2,3],ymm0[2,3]
23372 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
23373 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23374 ; AVX512BW-NEXT: vmovdqa64 4032(%rdi), %zmm1
23375 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23376 ; AVX512BW-NEXT: vmovdqa64 3968(%rdi), %zmm0
23377 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23378 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
23379 ; AVX512BW-NEXT: vmovdqa64 3904(%rdi), %zmm1
23380 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23381 ; AVX512BW-NEXT: vmovdqa64 3840(%rdi), %zmm4
23382 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23383 ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm4, %zmm2
23384 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
23385 ; AVX512BW-NEXT: vmovdqa64 3776(%rdi), %ymm17
23386 ; AVX512BW-NEXT: vmovdqa64 3712(%rdi), %ymm23
23387 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm23[0],ymm17[0],ymm23[2],ymm17[2]
23388 ; AVX512BW-NEXT: vmovdqa 3648(%rdi), %ymm1
23389 ; AVX512BW-NEXT: vmovdqa 3584(%rdi), %ymm0
23390 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
23391 ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm6[2,3],ymm5[2,3]
23392 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2
23393 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23394 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11]
23395 ; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
23396 ; AVX512BW-NEXT: vpermt2q %zmm8, %zmm2, %zmm28
23397 ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm8
23398 ; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23399 ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm6
23400 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
23401 ; AVX512BW-NEXT: vpermt2q %zmm16, %zmm2, %zmm6
23402 ; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm6 {%k1}
23403 ; AVX512BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
23404 ; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload
23405 ; AVX512BW-NEXT: # ymm5 = ymm4[1],mem[1],ymm4[3],mem[3]
23406 ; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
23407 ; AVX512BW-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3]
23408 ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3]
23409 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5
23410 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23411 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
23412 ; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm5
23413 ; AVX512BW-NEXT: vpermt2q %zmm9, %zmm2, %zmm5
23414 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
23415 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm6
23416 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
23417 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm6
23418 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1}
23419 ; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm20, %ymm5 # 32-byte Folded Reload
23420 ; AVX512BW-NEXT: # ymm5 = ymm20[1],mem[1],ymm20[3],mem[3]
23421 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm19[1],ymm22[1],ymm19[3],ymm22[3]
23422 ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3]
23423 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5
23424 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23425 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
23426 ; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm5
23427 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
23428 ; AVX512BW-NEXT: vpermt2q %zmm19, %zmm2, %zmm5
23429 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
23430 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
23431 ; AVX512BW-NEXT: vpermt2q %zmm22, %zmm2, %zmm6
23432 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1}
23433 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm15[1],ymm14[1],ymm15[3],ymm14[3]
23434 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm13[1],ymm21[1],ymm13[3],ymm21[3]
23435 ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3]
23436 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5
23437 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23438 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
23439 ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm5
23440 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
23441 ; AVX512BW-NEXT: vpermt2q %zmm15, %zmm2, %zmm5
23442 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
23443 ; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm6
23444 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
23445 ; AVX512BW-NEXT: vpermt2q %zmm13, %zmm2, %zmm6
23446 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1}
23447 ; AVX512BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
23448 ; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
23449 ; AVX512BW-NEXT: # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3]
23450 ; AVX512BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
23451 ; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
23452 ; AVX512BW-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3]
23453 ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3]
23454 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5
23455 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23456 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
23457 ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 64-byte Folded Reload
23458 ; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload
23459 ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 64-byte Folded Reload
23460 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1}
23461 ; AVX512BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
23462 ; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
23463 ; AVX512BW-NEXT: # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3]
23464 ; AVX512BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
23465 ; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
23466 ; AVX512BW-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3]
23467 ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3]
23468 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5
23469 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23470 ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm29 # 64-byte Folded Reload
23471 ; AVX512BW-NEXT: vpermt2q %zmm24, %zmm2, %zmm25
23472 ; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm25 {%k1}
23473 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm26[1],ymm27[1],ymm26[3],ymm27[3]
23474 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm18[1],ymm30[1],ymm18[3],ymm30[3]
23475 ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3]
23476 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm25, %zmm5
23477 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23478 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
23479 ; AVX512BW-NEXT: vpermt2q %zmm24, %zmm2, %zmm31
23480 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
23481 ; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm6
23482 ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 64-byte Folded Reload
23483 ; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm6 {%k1}
23484 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm11[1],ymm12[1],ymm11[3],ymm12[3]
23485 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],ymm10[1],ymm3[3],ymm10[3]
23486 ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm5[2,3]
23487 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3
23488 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23489 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
23490 ; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm3
23491 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
23492 ; AVX512BW-NEXT: vpermt2q %zmm18, %zmm2, %zmm3
23493 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
23494 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
23495 ; AVX512BW-NEXT: vpermi2q %zmm11, %zmm12, %zmm2
23496 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1}
23497 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm23[1],ymm17[1],ymm23[3],ymm17[3]
23498 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
23499 ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
23500 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
23501 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23502 ; AVX512BW-NEXT: vmovdqa64 3264(%rdi), %zmm3
23503 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23504 ; AVX512BW-NEXT: vmovdqa64 3200(%rdi), %zmm10
23505 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12]
23506 ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
23507 ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1
23508 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm1
23509 ; AVX512BW-NEXT: vmovdqa64 3136(%rdi), %zmm23
23510 ; AVX512BW-NEXT: vmovdqa64 3072(%rdi), %zmm2
23511 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23512 ; AVX512BW-NEXT: vpermt2q %zmm23, %zmm0, %zmm2
23513 ; AVX512BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23514 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
23515 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
23516 ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm2
23517 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
23518 ; AVX512BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm2
23519 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm8[0],zmm16[0],zmm8[2],zmm16[2],zmm8[4],zmm16[4],zmm8[6],zmm16[6]
23520 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
23521 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
23522 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23523 ; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm2
23524 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23525 ; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm1
23526 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23527 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
23528 ; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm3
23529 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23530 ; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm2
23531 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23532 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm2
23533 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
23534 ; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm2
23535 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
23536 ; AVX512BW-NEXT: vpermt2q %zmm30, %zmm0, %zmm2
23537 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm9[0],zmm4[0],zmm9[2],zmm4[2],zmm9[4],zmm4[4],zmm9[6],zmm4[6]
23538 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
23539 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
23540 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23541 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm2
23542 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23543 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm1
23544 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23545 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
23546 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm3
23547 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23548 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2
23549 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23550 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm2
23551 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
23552 ; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm2
23553 ; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm2
23554 ; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm16
23555 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
23556 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm3[0],zmm22[0],zmm3[2],zmm22[2],zmm3[4],zmm22[4],zmm3[6],zmm22[6]
23557 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
23558 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
23559 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23560 ; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm2
23561 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23562 ; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm1
23563 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23564 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
23565 ; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm3
23566 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23567 ; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm2
23568 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23569 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm2
23570 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
23571 ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm2
23572 ; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm2
23573 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm13[0],zmm21[2],zmm13[2],zmm21[4],zmm13[4],zmm21[6],zmm13[6]
23574 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
23575 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
23576 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23577 ; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm2
23578 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23579 ; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm1
23580 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23581 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
23582 ; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm3
23583 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23584 ; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm2
23585 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23586 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm2
23587 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
23588 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
23589 ; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm2
23590 ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
23591 ; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm27 # 64-byte Reload
23592 ; AVX512BW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm3 # 64-byte Folded Reload
23593 ; AVX512BW-NEXT: # zmm3 = zmm27[0],mem[0],zmm27[2],mem[2],zmm27[4],mem[4],zmm27[6],mem[6]
23594 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
23595 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
23596 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23597 ; AVX512BW-NEXT: vmovdqa64 2752(%rdi), %zmm3
23598 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23599 ; AVX512BW-NEXT: vmovdqa64 2688(%rdi), %zmm8
23600 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1
23601 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23602 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm1
23603 ; AVX512BW-NEXT: vmovdqa64 2624(%rdi), %zmm3
23604 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23605 ; AVX512BW-NEXT: vmovdqa64 2560(%rdi), %zmm2
23606 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23607 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm2
23608 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
23609 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
23610 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm2
23611 ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
23612 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
23613 ; AVX512BW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm3 # 64-byte Folded Reload
23614 ; AVX512BW-NEXT: # zmm3 = zmm7[0],mem[0],zmm7[2],mem[2],zmm7[4],mem[4],zmm7[6],mem[6]
23615 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
23616 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
23617 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23618 ; AVX512BW-NEXT: vmovdqa64 2240(%rdi), %zmm2
23619 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23620 ; AVX512BW-NEXT: vmovdqa64 2176(%rdi), %zmm1
23621 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23622 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
23623 ; AVX512BW-NEXT: vmovdqa64 2112(%rdi), %zmm3
23624 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23625 ; AVX512BW-NEXT: vmovdqa64 2048(%rdi), %zmm2
23626 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23627 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm2
23628 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
23629 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
23630 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm2
23631 ; AVX512BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm2
23632 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
23633 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm26[0],zmm4[0],zmm26[2],zmm4[2],zmm26[4],zmm4[4],zmm26[6],zmm4[6]
23634 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
23635 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
23636 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23637 ; AVX512BW-NEXT: vmovdqa64 3776(%rdi), %zmm2
23638 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23639 ; AVX512BW-NEXT: vmovdqa64 3712(%rdi), %zmm1
23640 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23641 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
23642 ; AVX512BW-NEXT: vmovdqa64 3648(%rdi), %zmm3
23643 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23644 ; AVX512BW-NEXT: vmovdqa64 3584(%rdi), %zmm2
23645 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23646 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm2
23647 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
23648 ; AVX512BW-NEXT: vpermi2q %zmm18, %zmm29, %zmm0
23649 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm12[0],zmm11[0],zmm12[2],zmm11[2],zmm12[4],zmm11[4],zmm12[6],zmm11[6]
23650 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
23651 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0
23652 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23653 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,5,13,5,13,5,13]
23654 ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
23655 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23656 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
23657 ; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm10
23658 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
23659 ; AVX512BW-NEXT: vpermt2q %zmm23, %zmm0, %zmm2
23660 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm10[4,5,6,7]
23661 ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm2
23662 ; AVX512BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm2
23663 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
23664 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
23665 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm19[1],zmm25[1],zmm19[3],zmm25[3],zmm19[5],zmm25[5],zmm19[7],zmm25[7]
23666 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
23667 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
23668 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23669 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
23670 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
23671 ; AVX512BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm1
23672 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
23673 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
23674 ; AVX512BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm2
23675 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
23676 ; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm29
23677 ; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm2
23678 ; AVX512BW-NEXT: vpermt2q %zmm30, %zmm0, %zmm2
23679 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
23680 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
23681 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm15[1],zmm28[1],zmm15[3],zmm28[3],zmm15[5],zmm28[5],zmm15[7],zmm28[7]
23682 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
23683 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
23684 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23685 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
23686 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
23687 ; AVX512BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm1
23688 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
23689 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
23690 ; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm2
23691 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
23692 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
23693 ; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm2
23694 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
23695 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
23696 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm12[1],zmm18[1],zmm12[3],zmm18[3],zmm12[5],zmm18[5],zmm12[7],zmm18[7]
23697 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
23698 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
23699 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23700 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
23701 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
23702 ; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm1
23703 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
23704 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
23705 ; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm2
23706 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
23707 ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm2
23708 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
23709 ; AVX512BW-NEXT: vpermt2q %zmm23, %zmm0, %zmm2
23710 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
23711 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm3[1],zmm13[1],zmm3[3],zmm13[3],zmm3[5],zmm13[5],zmm3[7],zmm13[7]
23712 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
23713 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
23714 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23715 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
23716 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
23717 ; AVX512BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm1
23718 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
23719 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
23720 ; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm2
23721 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
23722 ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm31 # 64-byte Folded Reload
23723 ; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm3 # 64-byte Folded Reload
23724 ; AVX512BW-NEXT: # zmm3 = zmm27[1],mem[1],zmm27[3],mem[3],zmm27[5],mem[5],zmm27[7],mem[7]
23725 ; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm3 {%k1}
23726 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
23727 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23728 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
23729 ; AVX512BW-NEXT: vpermt2q %zmm31, %zmm0, %zmm8
23730 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
23731 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
23732 ; AVX512BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm2
23733 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm8[4,5,6,7]
23734 ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 64-byte Folded Reload
23735 ; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm3 # 64-byte Folded Reload
23736 ; AVX512BW-NEXT: # zmm3 = zmm7[1],mem[1],zmm7[3],mem[3],zmm7[5],mem[5],zmm7[7],mem[7]
23737 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1}
23738 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
23739 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23740 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
23741 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
23742 ; AVX512BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm1
23743 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
23744 ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm2
23745 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
23746 ; AVX512BW-NEXT: vpermt2q %zmm8, %zmm0, %zmm2
23747 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
23748 ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload
23749 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
23750 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm3[1],zmm4[1],zmm3[3],zmm4[3],zmm3[5],zmm4[5],zmm3[7],zmm4[7]
23751 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1}
23752 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
23753 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23754 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
23755 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1
23756 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
23757 ; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm1
23758 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
23759 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm2
23760 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
23761 ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm2
23762 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
23763 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
23764 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
23765 ; AVX512BW-NEXT: vpermi2q %zmm10, %zmm9, %zmm0
23766 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
23767 ; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload
23768 ; AVX512BW-NEXT: # zmm2 = zmm2[1],mem[1],zmm2[3],mem[3],zmm2[5],mem[5],zmm2[7],mem[7]
23769 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
23770 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0
23771 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23772 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14]
23773 ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
23774 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
23775 ; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm1
23776 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
23777 ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
23778 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
23779 ; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm2
23780 ; AVX512BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm2
23781 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
23782 ; AVX512BW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm2 {%k1} # 64-byte Folded Reload
23783 ; AVX512BW-NEXT: # zmm2 {%k1} = zmm25[0],mem[0],zmm25[2],mem[2],zmm25[4],mem[4],zmm25[6],mem[6]
23784 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
23785 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23786 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
23787 ; AVX512BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm1
23788 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
23789 ; AVX512BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm2
23790 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
23791 ; AVX512BW-NEXT: vpermt2q %zmm28, %zmm0, %zmm15
23792 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm15 {%k1} = zmm29[0],zmm30[0],zmm29[2],zmm30[2],zmm29[4],zmm30[4],zmm29[6],zmm30[6]
23793 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm1
23794 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23795 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
23796 ; AVX512BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm1
23797 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
23798 ; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm2
23799 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
23800 ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm2
23801 ; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm2
23802 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
23803 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
23804 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm30[0],zmm26[0],zmm30[2],zmm26[2],zmm30[4],zmm26[4],zmm30[6],zmm26[6]
23805 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
23806 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23807 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
23808 ; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm1
23809 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
23810 ; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm2
23811 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
23812 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
23813 ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm2
23814 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
23815 ; AVX512BW-NEXT: vpermt2q %zmm29, %zmm0, %zmm2
23816 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
23817 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm24[0],zmm23[0],zmm24[2],zmm23[2],zmm24[4],zmm23[4],zmm24[6],zmm23[6]
23818 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
23819 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23820 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
23821 ; AVX512BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm1
23822 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
23823 ; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm2
23824 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
23825 ; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload
23826 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
23827 ; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm2
23828 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
23829 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
23830 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6]
23831 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
23832 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23833 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
23834 ; AVX512BW-NEXT: vpermt2q %zmm31, %zmm0, %zmm1
23835 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
23836 ; AVX512BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm2
23837 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
23838 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
23839 ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
23840 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
23841 ; AVX512BW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm2 {%k1} # 64-byte Folded Reload
23842 ; AVX512BW-NEXT: # zmm2 {%k1} = zmm11[0],mem[0],zmm11[2],mem[2],zmm11[4],mem[4],zmm11[6],mem[6]
23843 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
23844 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23845 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
23846 ; AVX512BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm1
23847 ; AVX512BW-NEXT: vpermt2q %zmm8, %zmm0, %zmm7
23848 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7]
23849 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
23850 ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm2
23851 ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
23852 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
23853 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
23854 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm7[0],zmm28[0],zmm7[2],zmm28[2],zmm7[4],zmm28[4],zmm7[6],zmm28[6]
23855 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
23856 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23857 ; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm3
23858 ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm4
23859 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm3[4,5,6,7]
23860 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
23861 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
23862 ; AVX512BW-NEXT: vpermi2q %zmm25, %zmm15, %zmm0
23863 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm9[0],zmm10[0],zmm9[2],zmm10[2],zmm9[4],zmm10[4],zmm9[6],zmm10[6]
23864 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
23865 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23866 ; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm1
23867 ; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm0
23868 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [7,15,7,15,7,15,7,15]
23869 ; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
23870 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
23871 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm5, %zmm19
23872 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
23873 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
23874 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm19 {%k1} = zmm17[1],zmm4[1],zmm17[3],zmm4[3],zmm17[5],zmm4[5],zmm17[7],zmm4[7]
23875 ; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23876 ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm6
23877 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [0,8,0,8,0,8,0,8]
23878 ; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
23879 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm12, %zmm17
23880 ; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23881 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [1,9,1,9,1,9,1,9]
23882 ; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
23883 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm6
23884 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23885 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm12, %zmm1
23886 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23887 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm0
23888 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23889 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
23890 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm20
23891 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
23892 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
23893 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm5, %zmm1
23894 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
23895 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
23896 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm13[1],zmm11[1],zmm13[3],zmm11[3],zmm13[5],zmm11[5],zmm13[7],zmm11[7]
23897 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23898 ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1
23899 ; AVX512BW-NEXT: vpermt2q %zmm11, %zmm12, %zmm13
23900 ; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23901 ; AVX512BW-NEXT: vpermt2q %zmm11, %zmm2, %zmm1
23902 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23903 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm12, %zmm20
23904 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm0
23905 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23906 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
23907 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm17
23908 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
23909 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
23910 ; AVX512BW-NEXT: vpermt2q %zmm9, %zmm5, %zmm1
23911 ; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm10
23912 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm30[1],zmm26[1],zmm30[3],zmm26[3],zmm30[5],zmm26[5],zmm30[7],zmm26[7]
23913 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23914 ; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm1
23915 ; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm26
23916 ; AVX512BW-NEXT: vpermt2q %zmm10, %zmm12, %zmm26
23917 ; AVX512BW-NEXT: vpermt2q %zmm10, %zmm2, %zmm1
23918 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23919 ; AVX512BW-NEXT: vpermt2q %zmm9, %zmm12, %zmm17
23920 ; AVX512BW-NEXT: vpermt2q %zmm9, %zmm2, %zmm0
23921 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23922 ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm19
23923 ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm31
23924 ; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm3
23925 ; AVX512BW-NEXT: vpermt2q %zmm29, %zmm5, %zmm16
23926 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
23927 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm16 {%k1} = zmm24[1],zmm6[1],zmm24[3],zmm6[3],zmm24[5],zmm6[5],zmm24[7],zmm6[7]
23928 ; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23929 ; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm0
23930 ; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm29
23931 ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm12, %zmm29
23932 ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm2, %zmm0
23933 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23934 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm12, %zmm19
23935 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm31
23936 ; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload
23937 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm16
23938 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm27
23939 ; AVX512BW-NEXT: vpermt2q %zmm21, %zmm5, %zmm1
23940 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm23[1],zmm18[1],zmm23[3],zmm18[3],zmm23[5],zmm18[5],zmm23[7],zmm18[7]
23941 ; AVX512BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
23942 ; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm1
23943 ; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm24
23944 ; AVX512BW-NEXT: vpermt2q %zmm18, %zmm12, %zmm24
23945 ; AVX512BW-NEXT: vpermt2q %zmm18, %zmm2, %zmm1
23946 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23947 ; AVX512BW-NEXT: vpermt2q %zmm21, %zmm12, %zmm16
23948 ; AVX512BW-NEXT: vpermt2q %zmm21, %zmm2, %zmm27
23949 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
23950 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm18
23951 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm30
23952 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
23953 ; AVX512BW-NEXT: vpermt2q %zmm8, %zmm5, %zmm1
23954 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23955 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
23956 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm10[1],zmm0[1],zmm10[3],zmm0[3],zmm10[5],zmm0[5],zmm10[7],zmm0[7]
23957 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23958 ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1
23959 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm12, %zmm10
23960 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23961 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
23962 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23963 ; AVX512BW-NEXT: vpermt2q %zmm8, %zmm12, %zmm18
23964 ; AVX512BW-NEXT: vpermt2q %zmm8, %zmm2, %zmm30
23965 ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm6
23966 ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm23
23967 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
23968 ; AVX512BW-NEXT: vpermt2q %zmm8, %zmm5, %zmm6
23969 ; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm0
23970 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
23971 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm21[1],zmm28[1],zmm21[3],zmm28[3],zmm21[5],zmm28[5],zmm21[7],zmm28[7]
23972 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm28
23973 ; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm1
23974 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm12, %zmm21
23975 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
23976 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23977 ; AVX512BW-NEXT: vpermt2q %zmm8, %zmm12, %zmm23
23978 ; AVX512BW-NEXT: vpermt2q %zmm8, %zmm2, %zmm22
23979 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
23980 ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm0
23981 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
23982 ; AVX512BW-NEXT: vpermt2q %zmm8, %zmm12, %zmm0
23983 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23984 ; AVX512BW-NEXT: vpermi2q %zmm25, %zmm15, %zmm12
23985 ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm3
23986 ; AVX512BW-NEXT: vpermt2q %zmm8, %zmm2, %zmm3
23987 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23988 ; AVX512BW-NEXT: vpermi2q %zmm25, %zmm15, %zmm2
23989 ; AVX512BW-NEXT: vpermt2q %zmm25, %zmm5, %zmm15
23990 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm14[1],zmm8[1],zmm14[3],zmm8[3],zmm14[5],zmm8[5],zmm14[7],zmm8[7]
23991 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm25
23992 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
23993 ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm7 # 64-byte Folded Reload
23994 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23995 ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm0 # 64-byte Folded Reload
23996 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
23997 ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm8 # 64-byte Folded Reload
23998 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
23999 ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm1 # 64-byte Folded Reload
24000 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
24001 ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm13 # 64-byte Folded Reload
24002 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
24003 ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm3 # 64-byte Folded Reload
24004 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
24005 ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm9 # 64-byte Folded Reload
24006 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
24007 ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm4 # 64-byte Folded Reload
24008 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
24009 ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm11 # 64-byte Folded Reload
24010 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
24011 ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm6 # 64-byte Folded Reload
24012 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
24013 ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm15 # 64-byte Folded Reload
24014 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
24015 ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload
24016 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24017 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
24018 ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload
24019 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24020 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
24021 ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload
24022 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24023 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
24024 ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload
24025 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
24026 ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm14 # 64-byte Folded Reload
24027 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7]
24028 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
24029 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0
24030 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24031 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm8[4,5,6,7]
24032 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
24033 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
24034 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24035 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm13[4,5,6,7]
24036 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
24037 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
24038 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24039 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm9[4,5,6,7]
24040 ; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload
24041 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
24042 ; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
24043 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm11[4,5,6,7]
24044 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
24045 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
24046 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24047 ; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload
24048 ; AVX512BW-NEXT: # ymm0 = mem[0,1,2,3],ymm15[4,5,6,7]
24049 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm0
24050 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24051 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24052 ; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
24053 ; AVX512BW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
24054 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm25
24055 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm10[4,5,6,7]
24056 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
24057 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
24058 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24059 ; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm17 {%k1}
24060 ; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm8
24061 ; AVX512BW-NEXT: vinserti128 $1, 192(%rdi), %ymm8, %ymm8
24062 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
24063 ; AVX512BW-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm0
24064 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm28 = ymm0[0],ymm8[0],ymm0[2],ymm8[2]
24065 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm28, %zmm17, %zmm26
24066 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
24067 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1}
24068 ; AVX512BW-NEXT: vmovdqa 576(%rdi), %xmm13
24069 ; AVX512BW-NEXT: vinserti128 $1, 704(%rdi), %ymm13, %ymm13
24070 ; AVX512BW-NEXT: vmovdqa64 512(%rdi), %xmm28
24071 ; AVX512BW-NEXT: vinserti32x4 $1, 640(%rdi), %ymm28, %ymm28
24072 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm28[0],ymm13[0],ymm28[2],ymm13[2]
24073 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm1
24074 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24075 ; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm16 {%k1}
24076 ; AVX512BW-NEXT: vmovdqa 1088(%rdi), %xmm4
24077 ; AVX512BW-NEXT: vinserti128 $1, 1216(%rdi), %ymm4, %ymm4
24078 ; AVX512BW-NEXT: vmovdqa 1024(%rdi), %xmm7
24079 ; AVX512BW-NEXT: vinserti128 $1, 1152(%rdi), %ymm7, %ymm7
24080 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm7[0],ymm4[0],ymm7[2],ymm4[2]
24081 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm16, %zmm24
24082 ; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm19 {%k1}
24083 ; AVX512BW-NEXT: vmovdqa 1600(%rdi), %xmm5
24084 ; AVX512BW-NEXT: vinserti128 $1, 1728(%rdi), %ymm5, %ymm5
24085 ; AVX512BW-NEXT: vmovdqa 1536(%rdi), %xmm11
24086 ; AVX512BW-NEXT: vinserti128 $1, 1664(%rdi), %ymm11, %ymm11
24087 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm29 = ymm11[0],ymm5[0],ymm11[2],ymm5[2]
24088 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm29, %zmm19, %zmm19
24089 ; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm23 {%k1}
24090 ; AVX512BW-NEXT: vmovdqa 2112(%rdi), %xmm6
24091 ; AVX512BW-NEXT: vinserti128 $1, 2240(%rdi), %ymm6, %ymm6
24092 ; AVX512BW-NEXT: vmovdqa64 2048(%rdi), %xmm29
24093 ; AVX512BW-NEXT: vinserti32x4 $1, 2176(%rdi), %ymm29, %ymm29
24094 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm29[0],ymm6[0],ymm29[2],ymm6[2]
24095 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm23, %zmm3
24096 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
24097 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1}
24098 ; AVX512BW-NEXT: vmovdqa 2624(%rdi), %xmm10
24099 ; AVX512BW-NEXT: vinserti128 $1, 2752(%rdi), %ymm10, %ymm10
24100 ; AVX512BW-NEXT: vmovdqa64 2560(%rdi), %xmm23
24101 ; AVX512BW-NEXT: vinserti32x4 $1, 2688(%rdi), %ymm23, %ymm23
24102 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm20 = ymm23[0],ymm10[0],ymm23[2],ymm10[2]
24103 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm20, %zmm18, %zmm18
24104 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
24105 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
24106 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1}
24107 ; AVX512BW-NEXT: vmovdqa64 3136(%rdi), %xmm17
24108 ; AVX512BW-NEXT: vinserti32x4 $1, 3264(%rdi), %ymm17, %ymm17
24109 ; AVX512BW-NEXT: vmovdqa64 3072(%rdi), %xmm20
24110 ; AVX512BW-NEXT: vinserti32x4 $1, 3200(%rdi), %ymm20, %ymm20
24111 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm20[0],ymm17[0],ymm20[2],ymm17[2]
24112 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm15, %zmm1, %zmm15
24113 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
24114 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm12 {%k1}
24115 ; AVX512BW-NEXT: vmovdqa 3648(%rdi), %xmm9
24116 ; AVX512BW-NEXT: vinserti128 $1, 3776(%rdi), %ymm9, %ymm9
24117 ; AVX512BW-NEXT: vmovdqa 3584(%rdi), %xmm14
24118 ; AVX512BW-NEXT: vinserti128 $1, 3712(%rdi), %ymm14, %ymm14
24119 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm16 = ymm14[0],ymm9[0],ymm14[2],ymm9[2]
24120 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm16, %zmm12, %zmm1
24121 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
24122 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
24123 ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm12 {%k1}
24124 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm16 = ymm20[1],ymm17[1],ymm20[3],ymm17[3]
24125 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm16, %zmm12, %zmm16
24126 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
24127 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
24128 ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm12 {%k1}
24129 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm28[1],ymm13[1],ymm28[3],ymm13[3]
24130 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm12, %zmm13
24131 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
24132 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
24133 ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm12 {%k1}
24134 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm8[1],ymm0[3],ymm8[3]
24135 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0
24136 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
24137 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm31 {%k1}
24138 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm11[1],ymm5[1],ymm11[3],ymm5[3]
24139 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm31, %zmm5
24140 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
24141 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm27 {%k1}
24142 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm7[1],ymm4[1],ymm7[3],ymm4[3]
24143 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4
24144 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
24145 ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm30 {%k1}
24146 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm23[1],ymm10[1],ymm23[3],ymm10[3]
24147 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm30, %zmm7
24148 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
24149 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm22 {%k1}
24150 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm29[1],ymm6[1],ymm29[3],ymm6[3]
24151 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm22, %zmm6
24152 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
24153 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1}
24154 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm14[1],ymm9[1],ymm14[3],ymm9[3]
24155 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm2, %zmm2
24156 ; AVX512BW-NEXT: vmovdqa64 %zmm1, 448(%rsi)
24157 ; AVX512BW-NEXT: vmovdqa64 %zmm15, 384(%rsi)
24158 ; AVX512BW-NEXT: vmovdqa64 %zmm18, 320(%rsi)
24159 ; AVX512BW-NEXT: vmovdqa64 %zmm3, 256(%rsi)
24160 ; AVX512BW-NEXT: vmovdqa64 %zmm19, 192(%rsi)
24161 ; AVX512BW-NEXT: vmovdqa64 %zmm24, 128(%rsi)
24162 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
24163 ; AVX512BW-NEXT: vmovaps %zmm1, 64(%rsi)
24164 ; AVX512BW-NEXT: vmovdqa64 %zmm26, (%rsi)
24165 ; AVX512BW-NEXT: vmovdqa64 %zmm2, 448(%rdx)
24166 ; AVX512BW-NEXT: vmovdqa64 %zmm6, 256(%rdx)
24167 ; AVX512BW-NEXT: vmovdqa64 %zmm7, 320(%rdx)
24168 ; AVX512BW-NEXT: vmovdqa64 %zmm4, 128(%rdx)
24169 ; AVX512BW-NEXT: vmovdqa64 %zmm5, 192(%rdx)
24170 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
24171 ; AVX512BW-NEXT: vmovdqa64 %zmm13, 64(%rdx)
24172 ; AVX512BW-NEXT: vmovdqa64 %zmm16, 384(%rdx)
24173 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24174 ; AVX512BW-NEXT: vmovaps %zmm0, 448(%rcx)
24175 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24176 ; AVX512BW-NEXT: vmovaps %zmm0, 256(%rcx)
24177 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24178 ; AVX512BW-NEXT: vmovaps %zmm0, 320(%rcx)
24179 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24180 ; AVX512BW-NEXT: vmovaps %zmm0, 128(%rcx)
24181 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24182 ; AVX512BW-NEXT: vmovaps %zmm0, 192(%rcx)
24183 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24184 ; AVX512BW-NEXT: vmovaps %zmm0, (%rcx)
24185 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24186 ; AVX512BW-NEXT: vmovaps %zmm0, 64(%rcx)
24187 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24188 ; AVX512BW-NEXT: vmovaps %zmm0, 384(%rcx)
24189 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24190 ; AVX512BW-NEXT: vmovaps %zmm0, 448(%r8)
24191 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24192 ; AVX512BW-NEXT: vmovaps %zmm0, 256(%r8)
24193 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24194 ; AVX512BW-NEXT: vmovaps %zmm0, 320(%r8)
24195 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24196 ; AVX512BW-NEXT: vmovaps %zmm0, 128(%r8)
24197 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24198 ; AVX512BW-NEXT: vmovaps %zmm0, 192(%r8)
24199 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24200 ; AVX512BW-NEXT: vmovaps %zmm0, (%r8)
24201 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24202 ; AVX512BW-NEXT: vmovaps %zmm0, 64(%r8)
24203 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24204 ; AVX512BW-NEXT: vmovaps %zmm0, 384(%r8)
24205 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24206 ; AVX512BW-NEXT: vmovaps %zmm0, 448(%r9)
24207 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24208 ; AVX512BW-NEXT: vmovaps %zmm0, 256(%r9)
24209 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24210 ; AVX512BW-NEXT: vmovaps %zmm0, 320(%r9)
24211 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24212 ; AVX512BW-NEXT: vmovaps %zmm0, 128(%r9)
24213 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24214 ; AVX512BW-NEXT: vmovaps %zmm0, 192(%r9)
24215 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24216 ; AVX512BW-NEXT: vmovaps %zmm0, (%r9)
24217 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24218 ; AVX512BW-NEXT: vmovaps %zmm0, 64(%r9)
24219 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24220 ; AVX512BW-NEXT: vmovaps %zmm0, 384(%r9)
24221 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
24222 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24223 ; AVX512BW-NEXT: vmovaps %zmm0, 448(%rax)
24224 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24225 ; AVX512BW-NEXT: vmovaps %zmm0, 256(%rax)
24226 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24227 ; AVX512BW-NEXT: vmovaps %zmm0, 320(%rax)
24228 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24229 ; AVX512BW-NEXT: vmovaps %zmm0, 128(%rax)
24230 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24231 ; AVX512BW-NEXT: vmovaps %zmm0, 192(%rax)
24232 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24233 ; AVX512BW-NEXT: vmovaps %zmm0, (%rax)
24234 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24235 ; AVX512BW-NEXT: vmovaps %zmm0, 64(%rax)
24236 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24237 ; AVX512BW-NEXT: vmovaps %zmm0, 384(%rax)
24238 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
24239 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24240 ; AVX512BW-NEXT: vmovaps %zmm0, 448(%rax)
24241 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24242 ; AVX512BW-NEXT: vmovaps %zmm0, 256(%rax)
24243 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24244 ; AVX512BW-NEXT: vmovaps %zmm0, 320(%rax)
24245 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24246 ; AVX512BW-NEXT: vmovaps %zmm0, 128(%rax)
24247 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24248 ; AVX512BW-NEXT: vmovaps %zmm0, 192(%rax)
24249 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24250 ; AVX512BW-NEXT: vmovaps %zmm0, (%rax)
24251 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24252 ; AVX512BW-NEXT: vmovaps %zmm0, 64(%rax)
24253 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24254 ; AVX512BW-NEXT: vmovaps %zmm0, 384(%rax)
24255 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
24256 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24257 ; AVX512BW-NEXT: vmovaps %zmm0, 384(%rax)
24258 ; AVX512BW-NEXT: vmovdqa64 %zmm25, 448(%rax)
24259 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24260 ; AVX512BW-NEXT: vmovaps %zmm0, 256(%rax)
24261 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24262 ; AVX512BW-NEXT: vmovaps %zmm0, 320(%rax)
24263 ; AVX512BW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
24264 ; AVX512BW-NEXT: vmovaps %zmm0, 128(%rax)
24265 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24266 ; AVX512BW-NEXT: vmovaps %zmm0, 192(%rax)
24267 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24268 ; AVX512BW-NEXT: vmovaps %zmm0, (%rax)
24269 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24270 ; AVX512BW-NEXT: vmovaps %zmm0, 64(%rax)
24271 ; AVX512BW-NEXT: addq $6664, %rsp # imm = 0x1A08
24272 ; AVX512BW-NEXT: vzeroupper
24273 ; AVX512BW-NEXT: retq
24275 ; AVX512BW-FCP-LABEL: load_i64_stride8_vf64:
24276 ; AVX512BW-FCP: # %bb.0:
24277 ; AVX512BW-FCP-NEXT: subq $6664, %rsp # imm = 0x1A08
24278 ; AVX512BW-FCP-NEXT: vmovdqa64 3392(%rdi), %zmm3
24279 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24280 ; AVX512BW-FCP-NEXT: vmovdqa64 3328(%rdi), %zmm16
24281 ; AVX512BW-FCP-NEXT: vmovdqa64 3520(%rdi), %zmm8
24282 ; AVX512BW-FCP-NEXT: vmovdqa64 3456(%rdi), %zmm28
24283 ; AVX512BW-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm10
24284 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24285 ; AVX512BW-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm11
24286 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24287 ; AVX512BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm4
24288 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24289 ; AVX512BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm5
24290 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24291 ; AVX512BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm6
24292 ; AVX512BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm9
24293 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24294 ; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm12
24295 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24296 ; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm15
24297 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24298 ; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm13
24299 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24300 ; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm14
24301 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24302 ; AVX512BW-FCP-NEXT: movb $-64, %al
24303 ; AVX512BW-FCP-NEXT: kmovd %eax, %k1
24304 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
24305 ; AVX512BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
24306 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm0
24307 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24308 ; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm0
24309 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24310 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm1
24311 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
24312 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
24313 ; AVX512BW-FCP-NEXT: vmovdqa 3264(%rdi), %ymm3
24314 ; AVX512BW-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
24315 ; AVX512BW-FCP-NEXT: vmovdqa 3200(%rdi), %ymm0
24316 ; AVX512BW-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
24317 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
24318 ; AVX512BW-FCP-NEXT: vmovdqa 3136(%rdi), %ymm3
24319 ; AVX512BW-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
24320 ; AVX512BW-FCP-NEXT: vmovdqa 3072(%rdi), %ymm7
24321 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm7[0],ymm3[0],ymm7[2],ymm3[2]
24322 ; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
24323 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
24324 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24325 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm0
24326 ; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm0
24327 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm9
24328 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24329 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1
24330 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm2, %zmm1
24331 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
24332 ; AVX512BW-FCP-NEXT: vmovdqa 704(%rdi), %ymm0
24333 ; AVX512BW-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
24334 ; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %ymm20
24335 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm20[0],ymm0[0],ymm20[2],ymm0[2]
24336 ; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %ymm22
24337 ; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %ymm19
24338 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm22[0],ymm19[2],ymm22[2]
24339 ; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
24340 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
24341 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24342 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm0
24343 ; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm2, %zmm0
24344 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm1
24345 ; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm2, %zmm1
24346 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
24347 ; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm14
24348 ; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm15
24349 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm14[0],ymm15[2],ymm14[2]
24350 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %ymm21
24351 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm13
24352 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm13[0],ymm21[0],ymm13[2],ymm21[2]
24353 ; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
24354 ; AVX512BW-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm3
24355 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24356 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
24357 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24358 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0
24359 ; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm2, %zmm0
24360 ; AVX512BW-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm1
24361 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24362 ; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm2, %zmm1
24363 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
24364 ; AVX512BW-FCP-NEXT: vmovdqa 1728(%rdi), %ymm3
24365 ; AVX512BW-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
24366 ; AVX512BW-FCP-NEXT: vmovdqa 1664(%rdi), %ymm0
24367 ; AVX512BW-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
24368 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
24369 ; AVX512BW-FCP-NEXT: vmovdqa 1600(%rdi), %ymm4
24370 ; AVX512BW-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
24371 ; AVX512BW-FCP-NEXT: vmovdqa 1536(%rdi), %ymm3
24372 ; AVX512BW-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
24373 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2]
24374 ; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
24375 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
24376 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24377 ; AVX512BW-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm1
24378 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24379 ; AVX512BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm0
24380 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24381 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
24382 ; AVX512BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm3
24383 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24384 ; AVX512BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm1
24385 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
24386 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
24387 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
24388 ; AVX512BW-FCP-NEXT: vmovdqa 1216(%rdi), %ymm3
24389 ; AVX512BW-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
24390 ; AVX512BW-FCP-NEXT: vmovdqa 1152(%rdi), %ymm0
24391 ; AVX512BW-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
24392 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
24393 ; AVX512BW-FCP-NEXT: vmovdqa 1088(%rdi), %ymm4
24394 ; AVX512BW-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
24395 ; AVX512BW-FCP-NEXT: vmovdqa 1024(%rdi), %ymm3
24396 ; AVX512BW-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
24397 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2]
24398 ; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
24399 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
24400 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24401 ; AVX512BW-FCP-NEXT: vmovdqa64 3008(%rdi), %zmm1
24402 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24403 ; AVX512BW-FCP-NEXT: vmovdqa64 2944(%rdi), %zmm29
24404 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm0
24405 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24406 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
24407 ; AVX512BW-FCP-NEXT: vmovdqa64 2880(%rdi), %zmm24
24408 ; AVX512BW-FCP-NEXT: vmovdqa64 2816(%rdi), %zmm25
24409 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm1
24410 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24411 ; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm2, %zmm1
24412 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24413 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
24414 ; AVX512BW-FCP-NEXT: vmovdqa64 2752(%rdi), %ymm27
24415 ; AVX512BW-FCP-NEXT: vmovdqa64 2688(%rdi), %ymm26
24416 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm26[0],ymm27[0],ymm26[2],ymm27[2]
24417 ; AVX512BW-FCP-NEXT: vmovdqa64 2624(%rdi), %ymm30
24418 ; AVX512BW-FCP-NEXT: vmovdqa64 2560(%rdi), %ymm18
24419 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm18[0],ymm30[0],ymm18[2],ymm30[2]
24420 ; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
24421 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
24422 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24423 ; AVX512BW-FCP-NEXT: vmovdqa64 2496(%rdi), %zmm1
24424 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24425 ; AVX512BW-FCP-NEXT: vmovdqa64 2432(%rdi), %zmm31
24426 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm0
24427 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24428 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
24429 ; AVX512BW-FCP-NEXT: vmovdqa64 2368(%rdi), %zmm3
24430 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24431 ; AVX512BW-FCP-NEXT: vmovdqa64 2304(%rdi), %zmm1
24432 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24433 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
24434 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
24435 ; AVX512BW-FCP-NEXT: vmovdqa 2240(%rdi), %ymm12
24436 ; AVX512BW-FCP-NEXT: vmovdqa 2176(%rdi), %ymm11
24437 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm12[0],ymm11[2],ymm12[2]
24438 ; AVX512BW-FCP-NEXT: vmovdqa 2112(%rdi), %ymm10
24439 ; AVX512BW-FCP-NEXT: vmovdqa 2048(%rdi), %ymm3
24440 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm3[0],ymm10[0],ymm3[2],ymm10[2]
24441 ; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm5[2,3],ymm0[2,3]
24442 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
24443 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24444 ; AVX512BW-FCP-NEXT: vmovdqa64 4032(%rdi), %zmm1
24445 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24446 ; AVX512BW-FCP-NEXT: vmovdqa64 3968(%rdi), %zmm0
24447 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24448 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
24449 ; AVX512BW-FCP-NEXT: vmovdqa64 3904(%rdi), %zmm1
24450 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24451 ; AVX512BW-FCP-NEXT: vmovdqa64 3840(%rdi), %zmm4
24452 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24453 ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm4, %zmm2
24454 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
24455 ; AVX512BW-FCP-NEXT: vmovdqa64 3776(%rdi), %ymm17
24456 ; AVX512BW-FCP-NEXT: vmovdqa64 3712(%rdi), %ymm23
24457 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm23[0],ymm17[0],ymm23[2],ymm17[2]
24458 ; AVX512BW-FCP-NEXT: vmovdqa 3648(%rdi), %ymm1
24459 ; AVX512BW-FCP-NEXT: vmovdqa 3584(%rdi), %ymm0
24460 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
24461 ; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm6[2,3],ymm5[2,3]
24462 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2
24463 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24464 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11]
24465 ; AVX512BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
24466 ; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm28
24467 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm8
24468 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24469 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm6
24470 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
24471 ; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm2, %zmm6
24472 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm6 {%k1}
24473 ; AVX512BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
24474 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload
24475 ; AVX512BW-FCP-NEXT: # ymm5 = ymm4[1],mem[1],ymm4[3],mem[3]
24476 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
24477 ; AVX512BW-FCP-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3]
24478 ; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3]
24479 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5
24480 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24481 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
24482 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm5
24483 ; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm5
24484 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
24485 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm6
24486 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
24487 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm2, %zmm6
24488 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1}
24489 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm20, %ymm5 # 32-byte Folded Reload
24490 ; AVX512BW-FCP-NEXT: # ymm5 = ymm20[1],mem[1],ymm20[3],mem[3]
24491 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm19[1],ymm22[1],ymm19[3],ymm22[3]
24492 ; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3]
24493 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5
24494 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24495 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
24496 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm5
24497 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
24498 ; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm2, %zmm5
24499 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
24500 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
24501 ; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm2, %zmm6
24502 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1}
24503 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm15[1],ymm14[1],ymm15[3],ymm14[3]
24504 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm13[1],ymm21[1],ymm13[3],ymm21[3]
24505 ; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3]
24506 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5
24507 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24508 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
24509 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm5
24510 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
24511 ; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm2, %zmm5
24512 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
24513 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm6
24514 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
24515 ; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm2, %zmm6
24516 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1}
24517 ; AVX512BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
24518 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
24519 ; AVX512BW-FCP-NEXT: # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3]
24520 ; AVX512BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
24521 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
24522 ; AVX512BW-FCP-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3]
24523 ; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3]
24524 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5
24525 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24526 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
24527 ; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 64-byte Folded Reload
24528 ; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload
24529 ; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 64-byte Folded Reload
24530 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1}
24531 ; AVX512BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
24532 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
24533 ; AVX512BW-FCP-NEXT: # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3]
24534 ; AVX512BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
24535 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
24536 ; AVX512BW-FCP-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3]
24537 ; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3]
24538 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5
24539 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24540 ; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm29 # 64-byte Folded Reload
24541 ; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm2, %zmm25
24542 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm25 {%k1}
24543 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm26[1],ymm27[1],ymm26[3],ymm27[3]
24544 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm18[1],ymm30[1],ymm18[3],ymm30[3]
24545 ; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3]
24546 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm25, %zmm5
24547 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24548 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
24549 ; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm2, %zmm31
24550 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
24551 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm6
24552 ; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 64-byte Folded Reload
24553 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm6 {%k1}
24554 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm11[1],ymm12[1],ymm11[3],ymm12[3]
24555 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],ymm10[1],ymm3[3],ymm10[3]
24556 ; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm5[2,3]
24557 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3
24558 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24559 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
24560 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm3
24561 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
24562 ; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm2, %zmm3
24563 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
24564 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
24565 ; AVX512BW-FCP-NEXT: vpermi2q %zmm11, %zmm12, %zmm2
24566 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1}
24567 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm23[1],ymm17[1],ymm23[3],ymm17[3]
24568 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
24569 ; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
24570 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
24571 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24572 ; AVX512BW-FCP-NEXT: vmovdqa64 3264(%rdi), %zmm3
24573 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24574 ; AVX512BW-FCP-NEXT: vmovdqa64 3200(%rdi), %zmm10
24575 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12]
24576 ; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
24577 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm1
24578 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm1
24579 ; AVX512BW-FCP-NEXT: vmovdqa64 3136(%rdi), %zmm23
24580 ; AVX512BW-FCP-NEXT: vmovdqa64 3072(%rdi), %zmm2
24581 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24582 ; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm0, %zmm2
24583 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24584 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
24585 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
24586 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm2
24587 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
24588 ; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm2
24589 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm8[0],zmm16[0],zmm8[2],zmm16[2],zmm8[4],zmm16[4],zmm8[6],zmm16[6]
24590 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
24591 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
24592 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24593 ; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm2
24594 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24595 ; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm1
24596 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24597 ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
24598 ; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm3
24599 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24600 ; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm2
24601 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24602 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2
24603 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
24604 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm2
24605 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
24606 ; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm0, %zmm2
24607 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm9[0],zmm4[0],zmm9[2],zmm4[2],zmm9[4],zmm4[4],zmm9[6],zmm4[6]
24608 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
24609 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
24610 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24611 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm2
24612 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24613 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1
24614 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24615 ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
24616 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3
24617 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24618 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2
24619 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24620 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2
24621 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
24622 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm2
24623 ; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm2
24624 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm16
24625 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
24626 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm3[0],zmm22[0],zmm3[2],zmm22[2],zmm3[4],zmm22[4],zmm3[6],zmm22[6]
24627 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
24628 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
24629 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24630 ; AVX512BW-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm2
24631 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24632 ; AVX512BW-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm1
24633 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24634 ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
24635 ; AVX512BW-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm3
24636 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24637 ; AVX512BW-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm2
24638 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24639 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2
24640 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
24641 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm2
24642 ; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm2
24643 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm13[0],zmm21[2],zmm13[2],zmm21[4],zmm13[4],zmm21[6],zmm13[6]
24644 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
24645 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
24646 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24647 ; AVX512BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm2
24648 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24649 ; AVX512BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm1
24650 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24651 ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
24652 ; AVX512BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm3
24653 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24654 ; AVX512BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm2
24655 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24656 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2
24657 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
24658 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
24659 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm2
24660 ; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
24661 ; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm27 # 64-byte Reload
24662 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm3 # 64-byte Folded Reload
24663 ; AVX512BW-FCP-NEXT: # zmm3 = zmm27[0],mem[0],zmm27[2],mem[2],zmm27[4],mem[4],zmm27[6],mem[6]
24664 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
24665 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
24666 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24667 ; AVX512BW-FCP-NEXT: vmovdqa64 2752(%rdi), %zmm3
24668 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24669 ; AVX512BW-FCP-NEXT: vmovdqa64 2688(%rdi), %zmm8
24670 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1
24671 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24672 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm1
24673 ; AVX512BW-FCP-NEXT: vmovdqa64 2624(%rdi), %zmm3
24674 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24675 ; AVX512BW-FCP-NEXT: vmovdqa64 2560(%rdi), %zmm2
24676 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24677 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2
24678 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
24679 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
24680 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm2
24681 ; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
24682 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
24683 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm3 # 64-byte Folded Reload
24684 ; AVX512BW-FCP-NEXT: # zmm3 = zmm7[0],mem[0],zmm7[2],mem[2],zmm7[4],mem[4],zmm7[6],mem[6]
24685 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
24686 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
24687 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24688 ; AVX512BW-FCP-NEXT: vmovdqa64 2240(%rdi), %zmm2
24689 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24690 ; AVX512BW-FCP-NEXT: vmovdqa64 2176(%rdi), %zmm1
24691 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24692 ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
24693 ; AVX512BW-FCP-NEXT: vmovdqa64 2112(%rdi), %zmm3
24694 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24695 ; AVX512BW-FCP-NEXT: vmovdqa64 2048(%rdi), %zmm2
24696 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24697 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2
24698 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
24699 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
24700 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm2
24701 ; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm2
24702 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
24703 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm26[0],zmm4[0],zmm26[2],zmm4[2],zmm26[4],zmm4[4],zmm26[6],zmm4[6]
24704 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
24705 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
24706 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24707 ; AVX512BW-FCP-NEXT: vmovdqa64 3776(%rdi), %zmm2
24708 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24709 ; AVX512BW-FCP-NEXT: vmovdqa64 3712(%rdi), %zmm1
24710 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24711 ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
24712 ; AVX512BW-FCP-NEXT: vmovdqa64 3648(%rdi), %zmm3
24713 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24714 ; AVX512BW-FCP-NEXT: vmovdqa64 3584(%rdi), %zmm2
24715 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24716 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2
24717 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
24718 ; AVX512BW-FCP-NEXT: vpermi2q %zmm18, %zmm29, %zmm0
24719 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm12[0],zmm11[0],zmm12[2],zmm11[2],zmm12[4],zmm11[4],zmm12[6],zmm11[6]
24720 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
24721 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0
24722 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24723 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,5,13,5,13,5,13]
24724 ; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
24725 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24726 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
24727 ; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm10
24728 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
24729 ; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm0, %zmm2
24730 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm10[4,5,6,7]
24731 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm2
24732 ; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm2
24733 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
24734 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
24735 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm19[1],zmm25[1],zmm19[3],zmm25[3],zmm19[5],zmm25[5],zmm19[7],zmm25[7]
24736 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
24737 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
24738 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24739 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
24740 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
24741 ; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm1
24742 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
24743 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
24744 ; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm2
24745 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
24746 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm29
24747 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm2
24748 ; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm0, %zmm2
24749 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
24750 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
24751 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm15[1],zmm28[1],zmm15[3],zmm28[3],zmm15[5],zmm28[5],zmm15[7],zmm28[7]
24752 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
24753 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
24754 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24755 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
24756 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
24757 ; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm1
24758 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
24759 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
24760 ; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm0, %zmm2
24761 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
24762 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
24763 ; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm2
24764 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
24765 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
24766 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm12[1],zmm18[1],zmm12[3],zmm18[3],zmm12[5],zmm18[5],zmm12[7],zmm18[7]
24767 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
24768 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
24769 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24770 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
24771 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
24772 ; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm0, %zmm1
24773 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
24774 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
24775 ; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm2
24776 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
24777 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm2
24778 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
24779 ; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm0, %zmm2
24780 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
24781 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm3[1],zmm13[1],zmm3[3],zmm13[3],zmm3[5],zmm13[5],zmm3[7],zmm13[7]
24782 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
24783 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
24784 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24785 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
24786 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
24787 ; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm1
24788 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
24789 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
24790 ; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm0, %zmm2
24791 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
24792 ; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm31 # 64-byte Folded Reload
24793 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm3 # 64-byte Folded Reload
24794 ; AVX512BW-FCP-NEXT: # zmm3 = zmm27[1],mem[1],zmm27[3],mem[3],zmm27[5],mem[5],zmm27[7],mem[7]
24795 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm3 {%k1}
24796 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
24797 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24798 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
24799 ; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm8
24800 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
24801 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
24802 ; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm2
24803 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm8[4,5,6,7]
24804 ; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 64-byte Folded Reload
24805 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm3 # 64-byte Folded Reload
24806 ; AVX512BW-FCP-NEXT: # zmm3 = zmm7[1],mem[1],zmm7[3],mem[3],zmm7[5],mem[5],zmm7[7],mem[7]
24807 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1}
24808 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
24809 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24810 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
24811 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
24812 ; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm0, %zmm1
24813 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
24814 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm2
24815 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
24816 ; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm2
24817 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
24818 ; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload
24819 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
24820 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm3[1],zmm4[1],zmm3[3],zmm4[3],zmm3[5],zmm4[5],zmm3[7],zmm4[7]
24821 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1}
24822 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
24823 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24824 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
24825 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1
24826 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
24827 ; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm1
24828 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
24829 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm2
24830 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
24831 ; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm2
24832 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
24833 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
24834 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
24835 ; AVX512BW-FCP-NEXT: vpermi2q %zmm10, %zmm9, %zmm0
24836 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
24837 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload
24838 ; AVX512BW-FCP-NEXT: # zmm2 = zmm2[1],mem[1],zmm2[3],mem[3],zmm2[5],mem[5],zmm2[7],mem[7]
24839 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
24840 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0
24841 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24842 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14]
24843 ; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
24844 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
24845 ; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm1
24846 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
24847 ; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
24848 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
24849 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm2
24850 ; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm2
24851 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
24852 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm2 {%k1} # 64-byte Folded Reload
24853 ; AVX512BW-FCP-NEXT: # zmm2 {%k1} = zmm25[0],mem[0],zmm25[2],mem[2],zmm25[4],mem[4],zmm25[6],mem[6]
24854 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
24855 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24856 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
24857 ; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm1
24858 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
24859 ; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm2
24860 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
24861 ; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm0, %zmm15
24862 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm15 {%k1} = zmm29[0],zmm30[0],zmm29[2],zmm30[2],zmm29[4],zmm30[4],zmm29[6],zmm30[6]
24863 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm1
24864 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24865 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
24866 ; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm1
24867 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
24868 ; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm0, %zmm2
24869 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
24870 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm2
24871 ; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm2
24872 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
24873 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
24874 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm30[0],zmm26[0],zmm30[2],zmm26[2],zmm30[4],zmm26[4],zmm30[6],zmm26[6]
24875 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
24876 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24877 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
24878 ; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm0, %zmm1
24879 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
24880 ; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm2
24881 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
24882 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
24883 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm2
24884 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
24885 ; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm0, %zmm2
24886 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
24887 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm24[0],zmm23[0],zmm24[2],zmm23[2],zmm24[4],zmm23[4],zmm24[6],zmm23[6]
24888 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
24889 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24890 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
24891 ; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm1
24892 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
24893 ; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm0, %zmm2
24894 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
24895 ; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload
24896 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
24897 ; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm0, %zmm2
24898 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
24899 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
24900 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6]
24901 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
24902 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24903 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
24904 ; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm1
24905 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
24906 ; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm2
24907 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
24908 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
24909 ; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
24910 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
24911 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm2 {%k1} # 64-byte Folded Reload
24912 ; AVX512BW-FCP-NEXT: # zmm2 {%k1} = zmm11[0],mem[0],zmm11[2],mem[2],zmm11[4],mem[4],zmm11[6],mem[6]
24913 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
24914 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24915 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
24916 ; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm0, %zmm1
24917 ; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm7
24918 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7]
24919 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
24920 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm2
24921 ; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
24922 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
24923 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
24924 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm7[0],zmm28[0],zmm7[2],zmm28[2],zmm7[4],zmm28[4],zmm7[6],zmm28[6]
24925 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
24926 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24927 ; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm3
24928 ; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm4
24929 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm3[4,5,6,7]
24930 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
24931 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
24932 ; AVX512BW-FCP-NEXT: vpermi2q %zmm25, %zmm15, %zmm0
24933 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm9[0],zmm10[0],zmm9[2],zmm10[2],zmm9[4],zmm10[4],zmm9[6],zmm10[6]
24934 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
24935 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24936 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm1
24937 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm0
24938 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [7,15,7,15,7,15,7,15]
24939 ; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
24940 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
24941 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm19
24942 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
24943 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
24944 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm19 {%k1} = zmm17[1],zmm4[1],zmm17[3],zmm4[3],zmm17[5],zmm4[5],zmm17[7],zmm4[7]
24945 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24946 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm6
24947 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [0,8,0,8,0,8,0,8]
24948 ; AVX512BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
24949 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm12, %zmm17
24950 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24951 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [1,9,1,9,1,9,1,9]
24952 ; AVX512BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
24953 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm2, %zmm6
24954 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24955 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm12, %zmm1
24956 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24957 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm0
24958 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24959 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
24960 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm20
24961 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0
24962 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
24963 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm5, %zmm1
24964 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
24965 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
24966 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm13[1],zmm11[1],zmm13[3],zmm11[3],zmm13[5],zmm11[5],zmm13[7],zmm11[7]
24967 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24968 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm1
24969 ; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm12, %zmm13
24970 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24971 ; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm2, %zmm1
24972 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24973 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm12, %zmm20
24974 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm2, %zmm0
24975 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24976 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
24977 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm17
24978 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0
24979 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
24980 ; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm5, %zmm1
24981 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm10
24982 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm30[1],zmm26[1],zmm30[3],zmm26[3],zmm30[5],zmm26[5],zmm30[7],zmm26[7]
24983 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24984 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm1
24985 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm26
24986 ; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm12, %zmm26
24987 ; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm2, %zmm1
24988 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24989 ; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm12, %zmm17
24990 ; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm0
24991 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24992 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm19
24993 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm31
24994 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm3
24995 ; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm5, %zmm16
24996 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
24997 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm16 {%k1} = zmm24[1],zmm6[1],zmm24[3],zmm6[3],zmm24[5],zmm6[5],zmm24[7],zmm6[7]
24998 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24999 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm0
25000 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm29
25001 ; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm12, %zmm29
25002 ; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm0
25003 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25004 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm12, %zmm19
25005 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm31
25006 ; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload
25007 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm16
25008 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm27
25009 ; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm5, %zmm1
25010 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm23[1],zmm18[1],zmm23[3],zmm18[3],zmm23[5],zmm18[5],zmm23[7],zmm18[7]
25011 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
25012 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm1
25013 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm24
25014 ; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm12, %zmm24
25015 ; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm2, %zmm1
25016 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25017 ; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm12, %zmm16
25018 ; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm2, %zmm27
25019 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
25020 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm18
25021 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm30
25022 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
25023 ; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm5, %zmm1
25024 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
25025 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
25026 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm10[1],zmm0[1],zmm10[3],zmm0[3],zmm10[5],zmm0[5],zmm10[7],zmm0[7]
25027 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25028 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm1
25029 ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm12, %zmm10
25030 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25031 ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
25032 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25033 ; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm12, %zmm18
25034 ; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm30
25035 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm6
25036 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm23
25037 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
25038 ; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm5, %zmm6
25039 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm0
25040 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
25041 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm21[1],zmm28[1],zmm21[3],zmm28[3],zmm21[5],zmm28[5],zmm21[7],zmm28[7]
25042 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm28
25043 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm1
25044 ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm12, %zmm21
25045 ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
25046 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25047 ; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm12, %zmm23
25048 ; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm22
25049 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
25050 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm0
25051 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
25052 ; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm12, %zmm0
25053 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25054 ; AVX512BW-FCP-NEXT: vpermi2q %zmm25, %zmm15, %zmm12
25055 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm3
25056 ; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm3
25057 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25058 ; AVX512BW-FCP-NEXT: vpermi2q %zmm25, %zmm15, %zmm2
25059 ; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm5, %zmm15
25060 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm14[1],zmm8[1],zmm14[3],zmm8[3],zmm14[5],zmm8[5],zmm14[7],zmm8[7]
25061 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm25
25062 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
25063 ; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm7 # 64-byte Folded Reload
25064 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
25065 ; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm0 # 64-byte Folded Reload
25066 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
25067 ; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm8 # 64-byte Folded Reload
25068 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
25069 ; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm1 # 64-byte Folded Reload
25070 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
25071 ; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm13 # 64-byte Folded Reload
25072 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
25073 ; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm3 # 64-byte Folded Reload
25074 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
25075 ; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm9 # 64-byte Folded Reload
25076 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
25077 ; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm4 # 64-byte Folded Reload
25078 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
25079 ; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm11 # 64-byte Folded Reload
25080 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
25081 ; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm6 # 64-byte Folded Reload
25082 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
25083 ; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm15 # 64-byte Folded Reload
25084 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
25085 ; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload
25086 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25087 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
25088 ; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload
25089 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25090 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
25091 ; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload
25092 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25093 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
25094 ; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload
25095 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
25096 ; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm14 # 64-byte Folded Reload
25097 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7]
25098 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
25099 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0
25100 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25101 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm8[4,5,6,7]
25102 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
25103 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
25104 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25105 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm13[4,5,6,7]
25106 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
25107 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
25108 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25109 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm9[4,5,6,7]
25110 ; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload
25111 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
25112 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
25113 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm11[4,5,6,7]
25114 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
25115 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
25116 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25117 ; AVX512BW-FCP-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload
25118 ; AVX512BW-FCP-NEXT: # ymm0 = mem[0,1,2,3],ymm15[4,5,6,7]
25119 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm0
25120 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25121 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
25122 ; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
25123 ; AVX512BW-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
25124 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm25
25125 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm10[4,5,6,7]
25126 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
25127 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
25128 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25129 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm17 {%k1}
25130 ; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm8
25131 ; AVX512BW-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm8, %ymm8
25132 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
25133 ; AVX512BW-FCP-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm0
25134 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm28 = ymm0[0],ymm8[0],ymm0[2],ymm8[2]
25135 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm28, %zmm17, %zmm26
25136 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
25137 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1}
25138 ; AVX512BW-FCP-NEXT: vmovdqa 576(%rdi), %xmm13
25139 ; AVX512BW-FCP-NEXT: vinserti128 $1, 704(%rdi), %ymm13, %ymm13
25140 ; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %xmm28
25141 ; AVX512BW-FCP-NEXT: vinserti32x4 $1, 640(%rdi), %ymm28, %ymm28
25142 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm28[0],ymm13[0],ymm28[2],ymm13[2]
25143 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm1
25144 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25145 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm16 {%k1}
25146 ; AVX512BW-FCP-NEXT: vmovdqa 1088(%rdi), %xmm4
25147 ; AVX512BW-FCP-NEXT: vinserti128 $1, 1216(%rdi), %ymm4, %ymm4
25148 ; AVX512BW-FCP-NEXT: vmovdqa 1024(%rdi), %xmm7
25149 ; AVX512BW-FCP-NEXT: vinserti128 $1, 1152(%rdi), %ymm7, %ymm7
25150 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm7[0],ymm4[0],ymm7[2],ymm4[2]
25151 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm16, %zmm24
25152 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm19 {%k1}
25153 ; AVX512BW-FCP-NEXT: vmovdqa 1600(%rdi), %xmm5
25154 ; AVX512BW-FCP-NEXT: vinserti128 $1, 1728(%rdi), %ymm5, %ymm5
25155 ; AVX512BW-FCP-NEXT: vmovdqa 1536(%rdi), %xmm11
25156 ; AVX512BW-FCP-NEXT: vinserti128 $1, 1664(%rdi), %ymm11, %ymm11
25157 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm29 = ymm11[0],ymm5[0],ymm11[2],ymm5[2]
25158 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm29, %zmm19, %zmm19
25159 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm23 {%k1}
25160 ; AVX512BW-FCP-NEXT: vmovdqa 2112(%rdi), %xmm6
25161 ; AVX512BW-FCP-NEXT: vinserti128 $1, 2240(%rdi), %ymm6, %ymm6
25162 ; AVX512BW-FCP-NEXT: vmovdqa64 2048(%rdi), %xmm29
25163 ; AVX512BW-FCP-NEXT: vinserti32x4 $1, 2176(%rdi), %ymm29, %ymm29
25164 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm29[0],ymm6[0],ymm29[2],ymm6[2]
25165 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm23, %zmm3
25166 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
25167 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1}
25168 ; AVX512BW-FCP-NEXT: vmovdqa 2624(%rdi), %xmm10
25169 ; AVX512BW-FCP-NEXT: vinserti128 $1, 2752(%rdi), %ymm10, %ymm10
25170 ; AVX512BW-FCP-NEXT: vmovdqa64 2560(%rdi), %xmm23
25171 ; AVX512BW-FCP-NEXT: vinserti32x4 $1, 2688(%rdi), %ymm23, %ymm23
25172 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm20 = ymm23[0],ymm10[0],ymm23[2],ymm10[2]
25173 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm20, %zmm18, %zmm18
25174 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
25175 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
25176 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1}
25177 ; AVX512BW-FCP-NEXT: vmovdqa64 3136(%rdi), %xmm17
25178 ; AVX512BW-FCP-NEXT: vinserti32x4 $1, 3264(%rdi), %ymm17, %ymm17
25179 ; AVX512BW-FCP-NEXT: vmovdqa64 3072(%rdi), %xmm20
25180 ; AVX512BW-FCP-NEXT: vinserti32x4 $1, 3200(%rdi), %ymm20, %ymm20
25181 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm20[0],ymm17[0],ymm20[2],ymm17[2]
25182 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm15, %zmm1, %zmm15
25183 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
25184 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm12 {%k1}
25185 ; AVX512BW-FCP-NEXT: vmovdqa 3648(%rdi), %xmm9
25186 ; AVX512BW-FCP-NEXT: vinserti128 $1, 3776(%rdi), %ymm9, %ymm9
25187 ; AVX512BW-FCP-NEXT: vmovdqa 3584(%rdi), %xmm14
25188 ; AVX512BW-FCP-NEXT: vinserti128 $1, 3712(%rdi), %ymm14, %ymm14
25189 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm16 = ymm14[0],ymm9[0],ymm14[2],ymm9[2]
25190 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm16, %zmm12, %zmm1
25191 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
25192 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
25193 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm12 {%k1}
25194 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm16 = ymm20[1],ymm17[1],ymm20[3],ymm17[3]
25195 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm16, %zmm12, %zmm16
25196 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
25197 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
25198 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm12 {%k1}
25199 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm28[1],ymm13[1],ymm28[3],ymm13[3]
25200 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm12, %zmm13
25201 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
25202 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
25203 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm12 {%k1}
25204 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm8[1],ymm0[3],ymm8[3]
25205 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0
25206 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
25207 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm31 {%k1}
25208 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm11[1],ymm5[1],ymm11[3],ymm5[3]
25209 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm31, %zmm5
25210 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
25211 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm27 {%k1}
25212 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm7[1],ymm4[1],ymm7[3],ymm4[3]
25213 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4
25214 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
25215 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm30 {%k1}
25216 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm23[1],ymm10[1],ymm23[3],ymm10[3]
25217 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm30, %zmm7
25218 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
25219 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm22 {%k1}
25220 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm29[1],ymm6[1],ymm29[3],ymm6[3]
25221 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm22, %zmm6
25222 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
25223 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1}
25224 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm14[1],ymm9[1],ymm14[3],ymm9[3]
25225 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm2, %zmm2
25226 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 448(%rsi)
25227 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, 384(%rsi)
25228 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, 320(%rsi)
25229 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 256(%rsi)
25230 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, 192(%rsi)
25231 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, 128(%rsi)
25232 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
25233 ; AVX512BW-FCP-NEXT: vmovaps %zmm1, 64(%rsi)
25234 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, (%rsi)
25235 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 448(%rdx)
25236 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 256(%rdx)
25237 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, 320(%rdx)
25238 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 128(%rdx)
25239 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 192(%rdx)
25240 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%rdx)
25241 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, 64(%rdx)
25242 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, 384(%rdx)
25243 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
25244 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 448(%rcx)
25245 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
25246 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 256(%rcx)
25247 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
25248 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 320(%rcx)
25249 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
25250 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 128(%rcx)
25251 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
25252 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 192(%rcx)
25253 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
25254 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, (%rcx)
25255 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
25256 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 64(%rcx)
25257 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
25258 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 384(%rcx)
25259 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
25260 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 448(%r8)
25261 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
25262 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 256(%r8)
25263 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
25264 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 320(%r8)
25265 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
25266 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 128(%r8)
25267 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
25268 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 192(%r8)
25269 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
25270 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, (%r8)
25271 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
25272 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 64(%r8)
25273 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
25274 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 384(%r8)
25275 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
25276 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 448(%r9)
25277 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
25278 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 256(%r9)
25279 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
25280 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 320(%r9)
25281 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
25282 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 128(%r9)
25283 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
25284 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 192(%r9)
25285 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
25286 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, (%r9)
25287 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
25288 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 64(%r9)
25289 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
25290 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 384(%r9)
25291 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
25292 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
25293 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 448(%rax)
25294 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
25295 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 256(%rax)
25296 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
25297 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 320(%rax)
25298 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
25299 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 128(%rax)
25300 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
25301 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 192(%rax)
25302 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
25303 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, (%rax)
25304 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
25305 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 64(%rax)
25306 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
25307 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 384(%rax)
25308 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
25309 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
25310 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 448(%rax)
25311 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
25312 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 256(%rax)
25313 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
25314 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 320(%rax)
25315 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
25316 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 128(%rax)
25317 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
25318 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 192(%rax)
25319 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
25320 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, (%rax)
25321 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
25322 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 64(%rax)
25323 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
25324 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 384(%rax)
25325 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
25326 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
25327 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 384(%rax)
25328 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, 448(%rax)
25329 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
25330 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 256(%rax)
25331 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
25332 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 320(%rax)
25333 ; AVX512BW-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
25334 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 128(%rax)
25335 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
25336 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 192(%rax)
25337 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
25338 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, (%rax)
25339 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
25340 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 64(%rax)
25341 ; AVX512BW-FCP-NEXT: addq $6664, %rsp # imm = 0x1A08
25342 ; AVX512BW-FCP-NEXT: vzeroupper
25343 ; AVX512BW-FCP-NEXT: retq
25345 ; AVX512DQ-BW-LABEL: load_i64_stride8_vf64:
25346 ; AVX512DQ-BW: # %bb.0:
25347 ; AVX512DQ-BW-NEXT: subq $6664, %rsp # imm = 0x1A08
25348 ; AVX512DQ-BW-NEXT: vmovdqa64 3392(%rdi), %zmm3
25349 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25350 ; AVX512DQ-BW-NEXT: vmovdqa64 3328(%rdi), %zmm16
25351 ; AVX512DQ-BW-NEXT: vmovdqa64 3520(%rdi), %zmm8
25352 ; AVX512DQ-BW-NEXT: vmovdqa64 3456(%rdi), %zmm28
25353 ; AVX512DQ-BW-NEXT: vmovdqa64 1856(%rdi), %zmm10
25354 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25355 ; AVX512DQ-BW-NEXT: vmovdqa64 1984(%rdi), %zmm11
25356 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25357 ; AVX512DQ-BW-NEXT: vmovdqa64 832(%rdi), %zmm4
25358 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25359 ; AVX512DQ-BW-NEXT: vmovdqa64 768(%rdi), %zmm5
25360 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25361 ; AVX512DQ-BW-NEXT: vmovdqa64 960(%rdi), %zmm6
25362 ; AVX512DQ-BW-NEXT: vmovdqa64 896(%rdi), %zmm9
25363 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25364 ; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm12
25365 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25366 ; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm15
25367 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25368 ; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm13
25369 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25370 ; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm14
25371 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25372 ; AVX512DQ-BW-NEXT: movb $-64, %al
25373 ; AVX512DQ-BW-NEXT: kmovd %eax, %k1
25374 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
25375 ; AVX512DQ-BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
25376 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm0
25377 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25378 ; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm2, %zmm0
25379 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25380 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm1
25381 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
25382 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
25383 ; AVX512DQ-BW-NEXT: vmovdqa 3264(%rdi), %ymm3
25384 ; AVX512DQ-BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
25385 ; AVX512DQ-BW-NEXT: vmovdqa 3200(%rdi), %ymm0
25386 ; AVX512DQ-BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
25387 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
25388 ; AVX512DQ-BW-NEXT: vmovdqa 3136(%rdi), %ymm3
25389 ; AVX512DQ-BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
25390 ; AVX512DQ-BW-NEXT: vmovdqa 3072(%rdi), %ymm7
25391 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm7[0],ymm3[0],ymm7[2],ymm3[2]
25392 ; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
25393 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
25394 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25395 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm0
25396 ; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm2, %zmm0
25397 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm9
25398 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25399 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm1
25400 ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm1
25401 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
25402 ; AVX512DQ-BW-NEXT: vmovdqa 704(%rdi), %ymm0
25403 ; AVX512DQ-BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
25404 ; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %ymm20
25405 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm20[0],ymm0[0],ymm20[2],ymm0[2]
25406 ; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %ymm22
25407 ; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %ymm19
25408 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm22[0],ymm19[2],ymm22[2]
25409 ; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
25410 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
25411 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25412 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm0
25413 ; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm2, %zmm0
25414 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm1
25415 ; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm2, %zmm1
25416 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
25417 ; AVX512DQ-BW-NEXT: vmovdqa 192(%rdi), %ymm14
25418 ; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm15
25419 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm14[0],ymm15[2],ymm14[2]
25420 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %ymm21
25421 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm13
25422 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm13[0],ymm21[0],ymm13[2],ymm21[2]
25423 ; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
25424 ; AVX512DQ-BW-NEXT: vmovdqa64 1920(%rdi), %zmm3
25425 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25426 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
25427 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25428 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm0
25429 ; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm2, %zmm0
25430 ; AVX512DQ-BW-NEXT: vmovdqa64 1792(%rdi), %zmm1
25431 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25432 ; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm2, %zmm1
25433 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
25434 ; AVX512DQ-BW-NEXT: vmovdqa 1728(%rdi), %ymm3
25435 ; AVX512DQ-BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
25436 ; AVX512DQ-BW-NEXT: vmovdqa 1664(%rdi), %ymm0
25437 ; AVX512DQ-BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
25438 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
25439 ; AVX512DQ-BW-NEXT: vmovdqa 1600(%rdi), %ymm4
25440 ; AVX512DQ-BW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
25441 ; AVX512DQ-BW-NEXT: vmovdqa 1536(%rdi), %ymm3
25442 ; AVX512DQ-BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
25443 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2]
25444 ; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
25445 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
25446 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25447 ; AVX512DQ-BW-NEXT: vmovdqa64 1472(%rdi), %zmm1
25448 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25449 ; AVX512DQ-BW-NEXT: vmovdqa64 1408(%rdi), %zmm0
25450 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25451 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
25452 ; AVX512DQ-BW-NEXT: vmovdqa64 1344(%rdi), %zmm3
25453 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25454 ; AVX512DQ-BW-NEXT: vmovdqa64 1280(%rdi), %zmm1
25455 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
25456 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
25457 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
25458 ; AVX512DQ-BW-NEXT: vmovdqa 1216(%rdi), %ymm3
25459 ; AVX512DQ-BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
25460 ; AVX512DQ-BW-NEXT: vmovdqa 1152(%rdi), %ymm0
25461 ; AVX512DQ-BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
25462 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
25463 ; AVX512DQ-BW-NEXT: vmovdqa 1088(%rdi), %ymm4
25464 ; AVX512DQ-BW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
25465 ; AVX512DQ-BW-NEXT: vmovdqa 1024(%rdi), %ymm3
25466 ; AVX512DQ-BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
25467 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2]
25468 ; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
25469 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
25470 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25471 ; AVX512DQ-BW-NEXT: vmovdqa64 3008(%rdi), %zmm1
25472 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25473 ; AVX512DQ-BW-NEXT: vmovdqa64 2944(%rdi), %zmm29
25474 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm0
25475 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25476 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
25477 ; AVX512DQ-BW-NEXT: vmovdqa64 2880(%rdi), %zmm24
25478 ; AVX512DQ-BW-NEXT: vmovdqa64 2816(%rdi), %zmm25
25479 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm1
25480 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25481 ; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm2, %zmm1
25482 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25483 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
25484 ; AVX512DQ-BW-NEXT: vmovdqa64 2752(%rdi), %ymm27
25485 ; AVX512DQ-BW-NEXT: vmovdqa64 2688(%rdi), %ymm26
25486 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm26[0],ymm27[0],ymm26[2],ymm27[2]
25487 ; AVX512DQ-BW-NEXT: vmovdqa64 2624(%rdi), %ymm30
25488 ; AVX512DQ-BW-NEXT: vmovdqa64 2560(%rdi), %ymm18
25489 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm18[0],ymm30[0],ymm18[2],ymm30[2]
25490 ; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
25491 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
25492 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25493 ; AVX512DQ-BW-NEXT: vmovdqa64 2496(%rdi), %zmm1
25494 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25495 ; AVX512DQ-BW-NEXT: vmovdqa64 2432(%rdi), %zmm31
25496 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm0
25497 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25498 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
25499 ; AVX512DQ-BW-NEXT: vmovdqa64 2368(%rdi), %zmm3
25500 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25501 ; AVX512DQ-BW-NEXT: vmovdqa64 2304(%rdi), %zmm1
25502 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25503 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
25504 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
25505 ; AVX512DQ-BW-NEXT: vmovdqa 2240(%rdi), %ymm12
25506 ; AVX512DQ-BW-NEXT: vmovdqa 2176(%rdi), %ymm11
25507 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm12[0],ymm11[2],ymm12[2]
25508 ; AVX512DQ-BW-NEXT: vmovdqa 2112(%rdi), %ymm10
25509 ; AVX512DQ-BW-NEXT: vmovdqa 2048(%rdi), %ymm3
25510 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm3[0],ymm10[0],ymm3[2],ymm10[2]
25511 ; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm5[2,3],ymm0[2,3]
25512 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
25513 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25514 ; AVX512DQ-BW-NEXT: vmovdqa64 4032(%rdi), %zmm1
25515 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25516 ; AVX512DQ-BW-NEXT: vmovdqa64 3968(%rdi), %zmm0
25517 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25518 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
25519 ; AVX512DQ-BW-NEXT: vmovdqa64 3904(%rdi), %zmm1
25520 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25521 ; AVX512DQ-BW-NEXT: vmovdqa64 3840(%rdi), %zmm4
25522 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25523 ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm4, %zmm2
25524 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
25525 ; AVX512DQ-BW-NEXT: vmovdqa64 3776(%rdi), %ymm17
25526 ; AVX512DQ-BW-NEXT: vmovdqa64 3712(%rdi), %ymm23
25527 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm23[0],ymm17[0],ymm23[2],ymm17[2]
25528 ; AVX512DQ-BW-NEXT: vmovdqa 3648(%rdi), %ymm1
25529 ; AVX512DQ-BW-NEXT: vmovdqa 3584(%rdi), %ymm0
25530 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
25531 ; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm6[2,3],ymm5[2,3]
25532 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2
25533 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25534 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11]
25535 ; AVX512DQ-BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
25536 ; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm2, %zmm28
25537 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm8
25538 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25539 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm6
25540 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
25541 ; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm2, %zmm6
25542 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm6 {%k1}
25543 ; AVX512DQ-BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
25544 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload
25545 ; AVX512DQ-BW-NEXT: # ymm5 = ymm4[1],mem[1],ymm4[3],mem[3]
25546 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
25547 ; AVX512DQ-BW-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3]
25548 ; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3]
25549 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5
25550 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25551 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
25552 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm5
25553 ; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm2, %zmm5
25554 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
25555 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm6
25556 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
25557 ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm6
25558 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1}
25559 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm20, %ymm5 # 32-byte Folded Reload
25560 ; AVX512DQ-BW-NEXT: # ymm5 = ymm20[1],mem[1],ymm20[3],mem[3]
25561 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm19[1],ymm22[1],ymm19[3],ymm22[3]
25562 ; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3]
25563 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5
25564 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25565 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
25566 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm5
25567 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
25568 ; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm2, %zmm5
25569 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
25570 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
25571 ; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm2, %zmm6
25572 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1}
25573 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm15[1],ymm14[1],ymm15[3],ymm14[3]
25574 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm13[1],ymm21[1],ymm13[3],ymm21[3]
25575 ; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3]
25576 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5
25577 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25578 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
25579 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm5
25580 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
25581 ; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm2, %zmm5
25582 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
25583 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm6
25584 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
25585 ; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm2, %zmm6
25586 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1}
25587 ; AVX512DQ-BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
25588 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
25589 ; AVX512DQ-BW-NEXT: # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3]
25590 ; AVX512DQ-BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
25591 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
25592 ; AVX512DQ-BW-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3]
25593 ; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3]
25594 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5
25595 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25596 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
25597 ; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 64-byte Folded Reload
25598 ; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload
25599 ; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 64-byte Folded Reload
25600 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1}
25601 ; AVX512DQ-BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
25602 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
25603 ; AVX512DQ-BW-NEXT: # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3]
25604 ; AVX512DQ-BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
25605 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
25606 ; AVX512DQ-BW-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3]
25607 ; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3]
25608 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5
25609 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25610 ; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm29 # 64-byte Folded Reload
25611 ; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm2, %zmm25
25612 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm25 {%k1}
25613 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm26[1],ymm27[1],ymm26[3],ymm27[3]
25614 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm18[1],ymm30[1],ymm18[3],ymm30[3]
25615 ; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3]
25616 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm25, %zmm5
25617 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25618 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
25619 ; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm2, %zmm31
25620 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
25621 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm6
25622 ; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 64-byte Folded Reload
25623 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm6 {%k1}
25624 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm11[1],ymm12[1],ymm11[3],ymm12[3]
25625 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],ymm10[1],ymm3[3],ymm10[3]
25626 ; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm5[2,3]
25627 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3
25628 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25629 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
25630 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm3
25631 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
25632 ; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm2, %zmm3
25633 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
25634 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
25635 ; AVX512DQ-BW-NEXT: vpermi2q %zmm11, %zmm12, %zmm2
25636 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1}
25637 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm23[1],ymm17[1],ymm23[3],ymm17[3]
25638 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
25639 ; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
25640 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
25641 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25642 ; AVX512DQ-BW-NEXT: vmovdqa64 3264(%rdi), %zmm3
25643 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25644 ; AVX512DQ-BW-NEXT: vmovdqa64 3200(%rdi), %zmm10
25645 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12]
25646 ; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
25647 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm1
25648 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm1
25649 ; AVX512DQ-BW-NEXT: vmovdqa64 3136(%rdi), %zmm23
25650 ; AVX512DQ-BW-NEXT: vmovdqa64 3072(%rdi), %zmm2
25651 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25652 ; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm0, %zmm2
25653 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25654 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
25655 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
25656 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm2
25657 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
25658 ; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm2
25659 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm8[0],zmm16[0],zmm8[2],zmm16[2],zmm8[4],zmm16[4],zmm8[6],zmm16[6]
25660 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
25661 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
25662 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25663 ; AVX512DQ-BW-NEXT: vmovdqa64 704(%rdi), %zmm2
25664 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25665 ; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %zmm1
25666 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25667 ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
25668 ; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %zmm3
25669 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25670 ; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %zmm2
25671 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25672 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm2
25673 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
25674 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm2
25675 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
25676 ; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm0, %zmm2
25677 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm9[0],zmm4[0],zmm9[2],zmm4[2],zmm9[4],zmm4[4],zmm9[6],zmm4[6]
25678 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
25679 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
25680 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25681 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm2
25682 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25683 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm1
25684 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25685 ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
25686 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm3
25687 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25688 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm2
25689 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25690 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm2
25691 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
25692 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm2
25693 ; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm2
25694 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm16
25695 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
25696 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm3[0],zmm22[0],zmm3[2],zmm22[2],zmm3[4],zmm22[4],zmm3[6],zmm22[6]
25697 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
25698 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
25699 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25700 ; AVX512DQ-BW-NEXT: vmovdqa64 1728(%rdi), %zmm2
25701 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25702 ; AVX512DQ-BW-NEXT: vmovdqa64 1664(%rdi), %zmm1
25703 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25704 ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
25705 ; AVX512DQ-BW-NEXT: vmovdqa64 1600(%rdi), %zmm3
25706 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25707 ; AVX512DQ-BW-NEXT: vmovdqa64 1536(%rdi), %zmm2
25708 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25709 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm2
25710 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
25711 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm2
25712 ; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm2
25713 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm13[0],zmm21[2],zmm13[2],zmm21[4],zmm13[4],zmm21[6],zmm13[6]
25714 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
25715 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
25716 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25717 ; AVX512DQ-BW-NEXT: vmovdqa64 1216(%rdi), %zmm2
25718 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25719 ; AVX512DQ-BW-NEXT: vmovdqa64 1152(%rdi), %zmm1
25720 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25721 ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
25722 ; AVX512DQ-BW-NEXT: vmovdqa64 1088(%rdi), %zmm3
25723 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25724 ; AVX512DQ-BW-NEXT: vmovdqa64 1024(%rdi), %zmm2
25725 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25726 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm2
25727 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
25728 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
25729 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm2
25730 ; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
25731 ; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm27 # 64-byte Reload
25732 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm3 # 64-byte Folded Reload
25733 ; AVX512DQ-BW-NEXT: # zmm3 = zmm27[0],mem[0],zmm27[2],mem[2],zmm27[4],mem[4],zmm27[6],mem[6]
25734 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
25735 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
25736 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25737 ; AVX512DQ-BW-NEXT: vmovdqa64 2752(%rdi), %zmm3
25738 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25739 ; AVX512DQ-BW-NEXT: vmovdqa64 2688(%rdi), %zmm8
25740 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm1
25741 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25742 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm1
25743 ; AVX512DQ-BW-NEXT: vmovdqa64 2624(%rdi), %zmm3
25744 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25745 ; AVX512DQ-BW-NEXT: vmovdqa64 2560(%rdi), %zmm2
25746 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25747 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm2
25748 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
25749 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
25750 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm2
25751 ; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
25752 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
25753 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm3 # 64-byte Folded Reload
25754 ; AVX512DQ-BW-NEXT: # zmm3 = zmm7[0],mem[0],zmm7[2],mem[2],zmm7[4],mem[4],zmm7[6],mem[6]
25755 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
25756 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
25757 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25758 ; AVX512DQ-BW-NEXT: vmovdqa64 2240(%rdi), %zmm2
25759 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25760 ; AVX512DQ-BW-NEXT: vmovdqa64 2176(%rdi), %zmm1
25761 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25762 ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
25763 ; AVX512DQ-BW-NEXT: vmovdqa64 2112(%rdi), %zmm3
25764 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25765 ; AVX512DQ-BW-NEXT: vmovdqa64 2048(%rdi), %zmm2
25766 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25767 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm2
25768 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
25769 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
25770 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm2
25771 ; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm2
25772 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
25773 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm26[0],zmm4[0],zmm26[2],zmm4[2],zmm26[4],zmm4[4],zmm26[6],zmm4[6]
25774 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
25775 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
25776 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25777 ; AVX512DQ-BW-NEXT: vmovdqa64 3776(%rdi), %zmm2
25778 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25779 ; AVX512DQ-BW-NEXT: vmovdqa64 3712(%rdi), %zmm1
25780 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25781 ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
25782 ; AVX512DQ-BW-NEXT: vmovdqa64 3648(%rdi), %zmm3
25783 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25784 ; AVX512DQ-BW-NEXT: vmovdqa64 3584(%rdi), %zmm2
25785 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25786 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm2
25787 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
25788 ; AVX512DQ-BW-NEXT: vpermi2q %zmm18, %zmm29, %zmm0
25789 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm12[0],zmm11[0],zmm12[2],zmm11[2],zmm12[4],zmm11[4],zmm12[6],zmm11[6]
25790 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
25791 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0
25792 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25793 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,5,13,5,13,5,13]
25794 ; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
25795 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25796 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
25797 ; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm10
25798 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
25799 ; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm0, %zmm2
25800 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm10[4,5,6,7]
25801 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm2
25802 ; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm2
25803 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
25804 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
25805 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm19[1],zmm25[1],zmm19[3],zmm25[3],zmm19[5],zmm25[5],zmm19[7],zmm25[7]
25806 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
25807 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
25808 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25809 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
25810 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
25811 ; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm1
25812 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
25813 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
25814 ; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm2
25815 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
25816 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm29
25817 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm2
25818 ; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm0, %zmm2
25819 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
25820 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
25821 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm15[1],zmm28[1],zmm15[3],zmm28[3],zmm15[5],zmm28[5],zmm15[7],zmm28[7]
25822 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
25823 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
25824 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25825 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
25826 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
25827 ; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm1
25828 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
25829 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
25830 ; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm2
25831 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
25832 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
25833 ; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm2
25834 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
25835 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
25836 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm12[1],zmm18[1],zmm12[3],zmm18[3],zmm12[5],zmm18[5],zmm12[7],zmm18[7]
25837 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
25838 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
25839 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25840 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
25841 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
25842 ; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm1
25843 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
25844 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
25845 ; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm2
25846 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
25847 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm2
25848 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
25849 ; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm0, %zmm2
25850 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
25851 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm3[1],zmm13[1],zmm3[3],zmm13[3],zmm3[5],zmm13[5],zmm3[7],zmm13[7]
25852 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
25853 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
25854 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25855 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
25856 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
25857 ; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm1
25858 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
25859 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
25860 ; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm2
25861 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
25862 ; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm31 # 64-byte Folded Reload
25863 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm3 # 64-byte Folded Reload
25864 ; AVX512DQ-BW-NEXT: # zmm3 = zmm27[1],mem[1],zmm27[3],mem[3],zmm27[5],mem[5],zmm27[7],mem[7]
25865 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm3 {%k1}
25866 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
25867 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25868 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
25869 ; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm0, %zmm8
25870 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
25871 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
25872 ; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm2
25873 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm8[4,5,6,7]
25874 ; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 64-byte Folded Reload
25875 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm3 # 64-byte Folded Reload
25876 ; AVX512DQ-BW-NEXT: # zmm3 = zmm7[1],mem[1],zmm7[3],mem[3],zmm7[5],mem[5],zmm7[7],mem[7]
25877 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1}
25878 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
25879 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25880 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
25881 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
25882 ; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm1
25883 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
25884 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm2
25885 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
25886 ; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm0, %zmm2
25887 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
25888 ; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload
25889 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
25890 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm3[1],zmm4[1],zmm3[3],zmm4[3],zmm3[5],zmm4[5],zmm3[7],zmm4[7]
25891 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1}
25892 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
25893 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25894 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
25895 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm1
25896 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
25897 ; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm1
25898 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
25899 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm2
25900 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
25901 ; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm2
25902 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
25903 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
25904 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
25905 ; AVX512DQ-BW-NEXT: vpermi2q %zmm10, %zmm9, %zmm0
25906 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
25907 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload
25908 ; AVX512DQ-BW-NEXT: # zmm2 = zmm2[1],mem[1],zmm2[3],mem[3],zmm2[5],mem[5],zmm2[7],mem[7]
25909 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
25910 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0
25911 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25912 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14]
25913 ; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
25914 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
25915 ; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm1
25916 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
25917 ; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
25918 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
25919 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm2
25920 ; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm2
25921 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
25922 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm2 {%k1} # 64-byte Folded Reload
25923 ; AVX512DQ-BW-NEXT: # zmm2 {%k1} = zmm25[0],mem[0],zmm25[2],mem[2],zmm25[4],mem[4],zmm25[6],mem[6]
25924 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
25925 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25926 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
25927 ; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm1
25928 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
25929 ; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm2
25930 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
25931 ; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm0, %zmm15
25932 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm15 {%k1} = zmm29[0],zmm30[0],zmm29[2],zmm30[2],zmm29[4],zmm30[4],zmm29[6],zmm30[6]
25933 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm1
25934 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25935 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
25936 ; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm1
25937 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
25938 ; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm2
25939 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
25940 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm2
25941 ; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm2
25942 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
25943 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
25944 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm30[0],zmm26[0],zmm30[2],zmm26[2],zmm30[4],zmm26[4],zmm30[6],zmm26[6]
25945 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
25946 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25947 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
25948 ; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm1
25949 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
25950 ; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm2
25951 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
25952 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
25953 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm2
25954 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
25955 ; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm0, %zmm2
25956 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
25957 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm24[0],zmm23[0],zmm24[2],zmm23[2],zmm24[4],zmm23[4],zmm24[6],zmm23[6]
25958 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
25959 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25960 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
25961 ; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm1
25962 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
25963 ; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm2
25964 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
25965 ; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload
25966 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
25967 ; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm2
25968 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
25969 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
25970 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6]
25971 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
25972 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25973 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
25974 ; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm0, %zmm1
25975 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
25976 ; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm2
25977 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
25978 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
25979 ; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
25980 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
25981 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm2 {%k1} # 64-byte Folded Reload
25982 ; AVX512DQ-BW-NEXT: # zmm2 {%k1} = zmm11[0],mem[0],zmm11[2],mem[2],zmm11[4],mem[4],zmm11[6],mem[6]
25983 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
25984 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25985 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
25986 ; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm1
25987 ; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm0, %zmm7
25988 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7]
25989 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
25990 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm2
25991 ; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
25992 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
25993 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
25994 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm7[0],zmm28[0],zmm7[2],zmm28[2],zmm7[4],zmm28[4],zmm7[6],zmm28[6]
25995 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
25996 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
25997 ; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm3
25998 ; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm4
25999 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm3[4,5,6,7]
26000 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
26001 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
26002 ; AVX512DQ-BW-NEXT: vpermi2q %zmm25, %zmm15, %zmm0
26003 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm9[0],zmm10[0],zmm9[2],zmm10[2],zmm9[4],zmm10[4],zmm9[6],zmm10[6]
26004 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
26005 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26006 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm1
26007 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm0
26008 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [7,15,7,15,7,15,7,15]
26009 ; AVX512DQ-BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
26010 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
26011 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm5, %zmm19
26012 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
26013 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
26014 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm19 {%k1} = zmm17[1],zmm4[1],zmm17[3],zmm4[3],zmm17[5],zmm4[5],zmm17[7],zmm4[7]
26015 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26016 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm6
26017 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [0,8,0,8,0,8,0,8]
26018 ; AVX512DQ-BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
26019 ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm12, %zmm17
26020 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26021 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [1,9,1,9,1,9,1,9]
26022 ; AVX512DQ-BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
26023 ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm6
26024 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26025 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm12, %zmm1
26026 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26027 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm0
26028 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26029 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
26030 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm20
26031 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm0
26032 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
26033 ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm5, %zmm1
26034 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
26035 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
26036 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm13[1],zmm11[1],zmm13[3],zmm11[3],zmm13[5],zmm11[5],zmm13[7],zmm11[7]
26037 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26038 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm1
26039 ; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm12, %zmm13
26040 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26041 ; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm2, %zmm1
26042 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26043 ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm12, %zmm20
26044 ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm0
26045 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26046 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
26047 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm17
26048 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm0
26049 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
26050 ; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm5, %zmm1
26051 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm10
26052 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm30[1],zmm26[1],zmm30[3],zmm26[3],zmm30[5],zmm26[5],zmm30[7],zmm26[7]
26053 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26054 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm1
26055 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm26
26056 ; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm12, %zmm26
26057 ; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm2, %zmm1
26058 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26059 ; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm12, %zmm17
26060 ; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm2, %zmm0
26061 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26062 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm19
26063 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm31
26064 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm3
26065 ; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm5, %zmm16
26066 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
26067 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm16 {%k1} = zmm24[1],zmm6[1],zmm24[3],zmm6[3],zmm24[5],zmm6[5],zmm24[7],zmm6[7]
26068 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26069 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm0
26070 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm29
26071 ; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm12, %zmm29
26072 ; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm2, %zmm0
26073 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26074 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm12, %zmm19
26075 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm31
26076 ; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload
26077 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm16
26078 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm27
26079 ; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm5, %zmm1
26080 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm23[1],zmm18[1],zmm23[3],zmm18[3],zmm23[5],zmm18[5],zmm23[7],zmm18[7]
26081 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
26082 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm1
26083 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm24
26084 ; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm12, %zmm24
26085 ; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm2, %zmm1
26086 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26087 ; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm12, %zmm16
26088 ; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm2, %zmm27
26089 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
26090 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm18
26091 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm30
26092 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
26093 ; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm5, %zmm1
26094 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
26095 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
26096 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm10[1],zmm0[1],zmm10[3],zmm0[3],zmm10[5],zmm0[5],zmm10[7],zmm0[7]
26097 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26098 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm1
26099 ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm12, %zmm10
26100 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26101 ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
26102 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26103 ; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm12, %zmm18
26104 ; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm2, %zmm30
26105 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm6
26106 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm23
26107 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
26108 ; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm5, %zmm6
26109 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm0
26110 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
26111 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm21[1],zmm28[1],zmm21[3],zmm28[3],zmm21[5],zmm28[5],zmm21[7],zmm28[7]
26112 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm28
26113 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm1
26114 ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm12, %zmm21
26115 ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
26116 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26117 ; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm12, %zmm23
26118 ; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm2, %zmm22
26119 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
26120 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm0
26121 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
26122 ; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm12, %zmm0
26123 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26124 ; AVX512DQ-BW-NEXT: vpermi2q %zmm25, %zmm15, %zmm12
26125 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm3
26126 ; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm2, %zmm3
26127 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26128 ; AVX512DQ-BW-NEXT: vpermi2q %zmm25, %zmm15, %zmm2
26129 ; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm5, %zmm15
26130 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm14[1],zmm8[1],zmm14[3],zmm8[3],zmm14[5],zmm8[5],zmm14[7],zmm8[7]
26131 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm25
26132 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
26133 ; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm7 # 64-byte Folded Reload
26134 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
26135 ; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm0 # 64-byte Folded Reload
26136 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
26137 ; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm8 # 64-byte Folded Reload
26138 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
26139 ; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm1 # 64-byte Folded Reload
26140 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
26141 ; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm13 # 64-byte Folded Reload
26142 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
26143 ; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm3 # 64-byte Folded Reload
26144 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
26145 ; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm9 # 64-byte Folded Reload
26146 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
26147 ; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm4 # 64-byte Folded Reload
26148 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
26149 ; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm11 # 64-byte Folded Reload
26150 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
26151 ; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm6 # 64-byte Folded Reload
26152 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
26153 ; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm15 # 64-byte Folded Reload
26154 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
26155 ; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload
26156 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26157 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
26158 ; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload
26159 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26160 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
26161 ; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload
26162 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26163 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
26164 ; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload
26165 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
26166 ; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm14 # 64-byte Folded Reload
26167 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7]
26168 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
26169 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0
26170 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26171 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm8[4,5,6,7]
26172 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
26173 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
26174 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26175 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm13[4,5,6,7]
26176 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
26177 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
26178 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26179 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm9[4,5,6,7]
26180 ; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload
26181 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
26182 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
26183 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm11[4,5,6,7]
26184 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
26185 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
26186 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26187 ; AVX512DQ-BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload
26188 ; AVX512DQ-BW-NEXT: # ymm0 = mem[0,1,2,3],ymm15[4,5,6,7]
26189 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm0
26190 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26191 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
26192 ; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
26193 ; AVX512DQ-BW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
26194 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm25
26195 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm10[4,5,6,7]
26196 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
26197 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
26198 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26199 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm17 {%k1}
26200 ; AVX512DQ-BW-NEXT: vmovdqa 64(%rdi), %xmm8
26201 ; AVX512DQ-BW-NEXT: vinserti128 $1, 192(%rdi), %ymm8, %ymm8
26202 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0
26203 ; AVX512DQ-BW-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm0
26204 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm28 = ymm0[0],ymm8[0],ymm0[2],ymm8[2]
26205 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm28, %zmm17, %zmm26
26206 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
26207 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1}
26208 ; AVX512DQ-BW-NEXT: vmovdqa 576(%rdi), %xmm13
26209 ; AVX512DQ-BW-NEXT: vinserti128 $1, 704(%rdi), %ymm13, %ymm13
26210 ; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %xmm28
26211 ; AVX512DQ-BW-NEXT: vinserti32x4 $1, 640(%rdi), %ymm28, %ymm28
26212 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm28[0],ymm13[0],ymm28[2],ymm13[2]
26213 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm1
26214 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26215 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm16 {%k1}
26216 ; AVX512DQ-BW-NEXT: vmovdqa 1088(%rdi), %xmm4
26217 ; AVX512DQ-BW-NEXT: vinserti128 $1, 1216(%rdi), %ymm4, %ymm4
26218 ; AVX512DQ-BW-NEXT: vmovdqa 1024(%rdi), %xmm7
26219 ; AVX512DQ-BW-NEXT: vinserti128 $1, 1152(%rdi), %ymm7, %ymm7
26220 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm7[0],ymm4[0],ymm7[2],ymm4[2]
26221 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm16, %zmm24
26222 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm19 {%k1}
26223 ; AVX512DQ-BW-NEXT: vmovdqa 1600(%rdi), %xmm5
26224 ; AVX512DQ-BW-NEXT: vinserti128 $1, 1728(%rdi), %ymm5, %ymm5
26225 ; AVX512DQ-BW-NEXT: vmovdqa 1536(%rdi), %xmm11
26226 ; AVX512DQ-BW-NEXT: vinserti128 $1, 1664(%rdi), %ymm11, %ymm11
26227 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm29 = ymm11[0],ymm5[0],ymm11[2],ymm5[2]
26228 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm29, %zmm19, %zmm19
26229 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm23 {%k1}
26230 ; AVX512DQ-BW-NEXT: vmovdqa 2112(%rdi), %xmm6
26231 ; AVX512DQ-BW-NEXT: vinserti128 $1, 2240(%rdi), %ymm6, %ymm6
26232 ; AVX512DQ-BW-NEXT: vmovdqa64 2048(%rdi), %xmm29
26233 ; AVX512DQ-BW-NEXT: vinserti32x4 $1, 2176(%rdi), %ymm29, %ymm29
26234 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm29[0],ymm6[0],ymm29[2],ymm6[2]
26235 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm23, %zmm3
26236 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
26237 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1}
26238 ; AVX512DQ-BW-NEXT: vmovdqa 2624(%rdi), %xmm10
26239 ; AVX512DQ-BW-NEXT: vinserti128 $1, 2752(%rdi), %ymm10, %ymm10
26240 ; AVX512DQ-BW-NEXT: vmovdqa64 2560(%rdi), %xmm23
26241 ; AVX512DQ-BW-NEXT: vinserti32x4 $1, 2688(%rdi), %ymm23, %ymm23
26242 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm20 = ymm23[0],ymm10[0],ymm23[2],ymm10[2]
26243 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm20, %zmm18, %zmm18
26244 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
26245 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
26246 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1}
26247 ; AVX512DQ-BW-NEXT: vmovdqa64 3136(%rdi), %xmm17
26248 ; AVX512DQ-BW-NEXT: vinserti32x4 $1, 3264(%rdi), %ymm17, %ymm17
26249 ; AVX512DQ-BW-NEXT: vmovdqa64 3072(%rdi), %xmm20
26250 ; AVX512DQ-BW-NEXT: vinserti32x4 $1, 3200(%rdi), %ymm20, %ymm20
26251 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm20[0],ymm17[0],ymm20[2],ymm17[2]
26252 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm15, %zmm1, %zmm15
26253 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
26254 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm12 {%k1}
26255 ; AVX512DQ-BW-NEXT: vmovdqa 3648(%rdi), %xmm9
26256 ; AVX512DQ-BW-NEXT: vinserti128 $1, 3776(%rdi), %ymm9, %ymm9
26257 ; AVX512DQ-BW-NEXT: vmovdqa 3584(%rdi), %xmm14
26258 ; AVX512DQ-BW-NEXT: vinserti128 $1, 3712(%rdi), %ymm14, %ymm14
26259 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm16 = ymm14[0],ymm9[0],ymm14[2],ymm9[2]
26260 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm16, %zmm12, %zmm1
26261 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
26262 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
26263 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm12 {%k1}
26264 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm16 = ymm20[1],ymm17[1],ymm20[3],ymm17[3]
26265 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm16, %zmm12, %zmm16
26266 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
26267 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
26268 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm12 {%k1}
26269 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm28[1],ymm13[1],ymm28[3],ymm13[3]
26270 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm13, %zmm12, %zmm13
26271 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
26272 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
26273 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm12 {%k1}
26274 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm8[1],ymm0[3],ymm8[3]
26275 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0
26276 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
26277 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm31 {%k1}
26278 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm11[1],ymm5[1],ymm11[3],ymm5[3]
26279 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm31, %zmm5
26280 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
26281 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm27 {%k1}
26282 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm7[1],ymm4[1],ymm7[3],ymm4[3]
26283 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4
26284 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
26285 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm30 {%k1}
26286 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm23[1],ymm10[1],ymm23[3],ymm10[3]
26287 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm7, %zmm30, %zmm7
26288 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
26289 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm22 {%k1}
26290 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm29[1],ymm6[1],ymm29[3],ymm6[3]
26291 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm6, %zmm22, %zmm6
26292 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
26293 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1}
26294 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm14[1],ymm9[1],ymm14[3],ymm9[3]
26295 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm2, %zmm2
26296 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 448(%rsi)
26297 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, 384(%rsi)
26298 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, 320(%rsi)
26299 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 256(%rsi)
26300 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, 192(%rsi)
26301 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, 128(%rsi)
26302 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
26303 ; AVX512DQ-BW-NEXT: vmovaps %zmm1, 64(%rsi)
26304 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, (%rsi)
26305 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 448(%rdx)
26306 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 256(%rdx)
26307 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, 320(%rdx)
26308 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 128(%rdx)
26309 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 192(%rdx)
26310 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%rdx)
26311 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, 64(%rdx)
26312 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, 384(%rdx)
26313 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
26314 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 448(%rcx)
26315 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
26316 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 256(%rcx)
26317 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
26318 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 320(%rcx)
26319 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
26320 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 128(%rcx)
26321 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
26322 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 192(%rcx)
26323 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
26324 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, (%rcx)
26325 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
26326 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 64(%rcx)
26327 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
26328 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 384(%rcx)
26329 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
26330 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 448(%r8)
26331 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
26332 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 256(%r8)
26333 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
26334 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 320(%r8)
26335 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
26336 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 128(%r8)
26337 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
26338 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 192(%r8)
26339 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
26340 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, (%r8)
26341 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
26342 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 64(%r8)
26343 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
26344 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 384(%r8)
26345 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
26346 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 448(%r9)
26347 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
26348 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 256(%r9)
26349 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
26350 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 320(%r9)
26351 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
26352 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 128(%r9)
26353 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
26354 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 192(%r9)
26355 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
26356 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, (%r9)
26357 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
26358 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 64(%r9)
26359 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
26360 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 384(%r9)
26361 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
26362 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
26363 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 448(%rax)
26364 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
26365 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 256(%rax)
26366 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
26367 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 320(%rax)
26368 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
26369 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 128(%rax)
26370 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
26371 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 192(%rax)
26372 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
26373 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, (%rax)
26374 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
26375 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 64(%rax)
26376 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
26377 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 384(%rax)
26378 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
26379 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
26380 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 448(%rax)
26381 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
26382 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 256(%rax)
26383 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
26384 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 320(%rax)
26385 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
26386 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 128(%rax)
26387 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
26388 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 192(%rax)
26389 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
26390 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, (%rax)
26391 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
26392 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 64(%rax)
26393 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
26394 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 384(%rax)
26395 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
26396 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
26397 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 384(%rax)
26398 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, 448(%rax)
26399 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
26400 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 256(%rax)
26401 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
26402 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 320(%rax)
26403 ; AVX512DQ-BW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
26404 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 128(%rax)
26405 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
26406 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 192(%rax)
26407 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
26408 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, (%rax)
26409 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
26410 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 64(%rax)
26411 ; AVX512DQ-BW-NEXT: addq $6664, %rsp # imm = 0x1A08
26412 ; AVX512DQ-BW-NEXT: vzeroupper
26413 ; AVX512DQ-BW-NEXT: retq
26415 ; AVX512DQ-BW-FCP-LABEL: load_i64_stride8_vf64:
26416 ; AVX512DQ-BW-FCP: # %bb.0:
26417 ; AVX512DQ-BW-FCP-NEXT: subq $6664, %rsp # imm = 0x1A08
26418 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3392(%rdi), %zmm3
26419 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26420 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3328(%rdi), %zmm16
26421 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3520(%rdi), %zmm8
26422 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3456(%rdi), %zmm28
26423 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm10
26424 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26425 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm11
26426 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26427 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm4
26428 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26429 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm5
26430 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26431 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm6
26432 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm9
26433 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26434 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm12
26435 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26436 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm15
26437 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26438 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm13
26439 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26440 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm14
26441 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26442 ; AVX512DQ-BW-FCP-NEXT: movb $-64, %al
26443 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1
26444 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
26445 ; AVX512DQ-BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
26446 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm0
26447 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26448 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm0
26449 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26450 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm1
26451 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
26452 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
26453 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 3264(%rdi), %ymm3
26454 ; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
26455 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 3200(%rdi), %ymm0
26456 ; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
26457 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
26458 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 3136(%rdi), %ymm3
26459 ; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
26460 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 3072(%rdi), %ymm7
26461 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm7[0],ymm3[0],ymm7[2],ymm3[2]
26462 ; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
26463 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
26464 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26465 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm0
26466 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm0
26467 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm9
26468 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26469 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1
26470 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm2, %zmm1
26471 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
26472 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 704(%rdi), %ymm0
26473 ; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
26474 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %ymm20
26475 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm20[0],ymm0[0],ymm20[2],ymm0[2]
26476 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %ymm22
26477 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %ymm19
26478 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm22[0],ymm19[2],ymm22[2]
26479 ; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
26480 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
26481 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26482 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm0
26483 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm2, %zmm0
26484 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm1
26485 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm2, %zmm1
26486 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
26487 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm14
26488 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm15
26489 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm14[0],ymm15[2],ymm14[2]
26490 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %ymm21
26491 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm13
26492 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm13[0],ymm21[0],ymm13[2],ymm21[2]
26493 ; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
26494 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm3
26495 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26496 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
26497 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26498 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0
26499 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm2, %zmm0
26500 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm1
26501 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26502 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm2, %zmm1
26503 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
26504 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 1728(%rdi), %ymm3
26505 ; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
26506 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 1664(%rdi), %ymm0
26507 ; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
26508 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
26509 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 1600(%rdi), %ymm4
26510 ; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
26511 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 1536(%rdi), %ymm3
26512 ; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
26513 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2]
26514 ; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
26515 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
26516 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26517 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm1
26518 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26519 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm0
26520 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26521 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
26522 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm3
26523 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26524 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm1
26525 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
26526 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
26527 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
26528 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 1216(%rdi), %ymm3
26529 ; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
26530 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 1152(%rdi), %ymm0
26531 ; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
26532 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
26533 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 1088(%rdi), %ymm4
26534 ; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
26535 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 1024(%rdi), %ymm3
26536 ; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
26537 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2]
26538 ; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
26539 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
26540 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26541 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3008(%rdi), %zmm1
26542 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26543 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2944(%rdi), %zmm29
26544 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm0
26545 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26546 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
26547 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2880(%rdi), %zmm24
26548 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2816(%rdi), %zmm25
26549 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm1
26550 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26551 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm2, %zmm1
26552 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26553 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
26554 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2752(%rdi), %ymm27
26555 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2688(%rdi), %ymm26
26556 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm26[0],ymm27[0],ymm26[2],ymm27[2]
26557 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2624(%rdi), %ymm30
26558 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2560(%rdi), %ymm18
26559 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm18[0],ymm30[0],ymm18[2],ymm30[2]
26560 ; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
26561 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
26562 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26563 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2496(%rdi), %zmm1
26564 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26565 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2432(%rdi), %zmm31
26566 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm0
26567 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26568 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
26569 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2368(%rdi), %zmm3
26570 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26571 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2304(%rdi), %zmm1
26572 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26573 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
26574 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
26575 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 2240(%rdi), %ymm12
26576 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 2176(%rdi), %ymm11
26577 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm12[0],ymm11[2],ymm12[2]
26578 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 2112(%rdi), %ymm10
26579 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 2048(%rdi), %ymm3
26580 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm3[0],ymm10[0],ymm3[2],ymm10[2]
26581 ; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm5[2,3],ymm0[2,3]
26582 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
26583 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26584 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 4032(%rdi), %zmm1
26585 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26586 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3968(%rdi), %zmm0
26587 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26588 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
26589 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3904(%rdi), %zmm1
26590 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26591 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3840(%rdi), %zmm4
26592 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26593 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm4, %zmm2
26594 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
26595 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3776(%rdi), %ymm17
26596 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3712(%rdi), %ymm23
26597 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm23[0],ymm17[0],ymm23[2],ymm17[2]
26598 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 3648(%rdi), %ymm1
26599 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 3584(%rdi), %ymm0
26600 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
26601 ; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm6[2,3],ymm5[2,3]
26602 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2
26603 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26604 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11]
26605 ; AVX512DQ-BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
26606 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm28
26607 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm8
26608 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26609 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm6
26610 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
26611 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm2, %zmm6
26612 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm6 {%k1}
26613 ; AVX512DQ-BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
26614 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload
26615 ; AVX512DQ-BW-FCP-NEXT: # ymm5 = ymm4[1],mem[1],ymm4[3],mem[3]
26616 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
26617 ; AVX512DQ-BW-FCP-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3]
26618 ; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3]
26619 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5
26620 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26621 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
26622 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm5
26623 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm5
26624 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
26625 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm6
26626 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
26627 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm2, %zmm6
26628 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1}
26629 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm20, %ymm5 # 32-byte Folded Reload
26630 ; AVX512DQ-BW-FCP-NEXT: # ymm5 = ymm20[1],mem[1],ymm20[3],mem[3]
26631 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm19[1],ymm22[1],ymm19[3],ymm22[3]
26632 ; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3]
26633 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5
26634 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26635 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
26636 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm5
26637 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
26638 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm2, %zmm5
26639 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
26640 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
26641 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm2, %zmm6
26642 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1}
26643 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm15[1],ymm14[1],ymm15[3],ymm14[3]
26644 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm13[1],ymm21[1],ymm13[3],ymm21[3]
26645 ; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3]
26646 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5
26647 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26648 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
26649 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm5
26650 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
26651 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm2, %zmm5
26652 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
26653 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm6
26654 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
26655 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm2, %zmm6
26656 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1}
26657 ; AVX512DQ-BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
26658 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
26659 ; AVX512DQ-BW-FCP-NEXT: # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3]
26660 ; AVX512DQ-BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
26661 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
26662 ; AVX512DQ-BW-FCP-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3]
26663 ; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3]
26664 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5
26665 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26666 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
26667 ; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 64-byte Folded Reload
26668 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload
26669 ; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 64-byte Folded Reload
26670 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1}
26671 ; AVX512DQ-BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
26672 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
26673 ; AVX512DQ-BW-FCP-NEXT: # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3]
26674 ; AVX512DQ-BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
26675 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
26676 ; AVX512DQ-BW-FCP-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3]
26677 ; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3]
26678 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5
26679 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26680 ; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm29 # 64-byte Folded Reload
26681 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm2, %zmm25
26682 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm25 {%k1}
26683 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm26[1],ymm27[1],ymm26[3],ymm27[3]
26684 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm18[1],ymm30[1],ymm18[3],ymm30[3]
26685 ; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3]
26686 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm25, %zmm5
26687 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26688 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
26689 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm2, %zmm31
26690 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
26691 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm6
26692 ; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 64-byte Folded Reload
26693 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm6 {%k1}
26694 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm11[1],ymm12[1],ymm11[3],ymm12[3]
26695 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],ymm10[1],ymm3[3],ymm10[3]
26696 ; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm5[2,3]
26697 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3
26698 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26699 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
26700 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm3
26701 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
26702 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm2, %zmm3
26703 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
26704 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
26705 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm11, %zmm12, %zmm2
26706 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1}
26707 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm23[1],ymm17[1],ymm23[3],ymm17[3]
26708 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
26709 ; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
26710 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
26711 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26712 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3264(%rdi), %zmm3
26713 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26714 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3200(%rdi), %zmm10
26715 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12]
26716 ; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
26717 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm1
26718 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm1
26719 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3136(%rdi), %zmm23
26720 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3072(%rdi), %zmm2
26721 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26722 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm0, %zmm2
26723 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26724 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
26725 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
26726 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm2
26727 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
26728 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm2
26729 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm8[0],zmm16[0],zmm8[2],zmm16[2],zmm8[4],zmm16[4],zmm8[6],zmm16[6]
26730 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
26731 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
26732 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26733 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm2
26734 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26735 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm1
26736 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26737 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
26738 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm3
26739 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26740 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm2
26741 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26742 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2
26743 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
26744 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm2
26745 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
26746 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm0, %zmm2
26747 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm9[0],zmm4[0],zmm9[2],zmm4[2],zmm9[4],zmm4[4],zmm9[6],zmm4[6]
26748 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
26749 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
26750 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26751 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm2
26752 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26753 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1
26754 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26755 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
26756 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3
26757 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26758 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2
26759 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26760 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2
26761 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
26762 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm2
26763 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm2
26764 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm16
26765 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
26766 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm3[0],zmm22[0],zmm3[2],zmm22[2],zmm3[4],zmm22[4],zmm3[6],zmm22[6]
26767 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
26768 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
26769 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26770 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm2
26771 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26772 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm1
26773 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26774 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
26775 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm3
26776 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26777 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm2
26778 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26779 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2
26780 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
26781 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm2
26782 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm2
26783 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm13[0],zmm21[2],zmm13[2],zmm21[4],zmm13[4],zmm21[6],zmm13[6]
26784 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
26785 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
26786 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26787 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm2
26788 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26789 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm1
26790 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26791 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
26792 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm3
26793 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26794 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm2
26795 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26796 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2
26797 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
26798 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
26799 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm2
26800 ; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
26801 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm27 # 64-byte Reload
26802 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm3 # 64-byte Folded Reload
26803 ; AVX512DQ-BW-FCP-NEXT: # zmm3 = zmm27[0],mem[0],zmm27[2],mem[2],zmm27[4],mem[4],zmm27[6],mem[6]
26804 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
26805 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
26806 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26807 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2752(%rdi), %zmm3
26808 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26809 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2688(%rdi), %zmm8
26810 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1
26811 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26812 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm1
26813 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2624(%rdi), %zmm3
26814 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26815 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2560(%rdi), %zmm2
26816 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26817 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2
26818 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
26819 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
26820 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm2
26821 ; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
26822 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
26823 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm3 # 64-byte Folded Reload
26824 ; AVX512DQ-BW-FCP-NEXT: # zmm3 = zmm7[0],mem[0],zmm7[2],mem[2],zmm7[4],mem[4],zmm7[6],mem[6]
26825 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
26826 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
26827 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26828 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2240(%rdi), %zmm2
26829 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26830 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2176(%rdi), %zmm1
26831 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26832 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
26833 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2112(%rdi), %zmm3
26834 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26835 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2048(%rdi), %zmm2
26836 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26837 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2
26838 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
26839 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
26840 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm2
26841 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm2
26842 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
26843 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm26[0],zmm4[0],zmm26[2],zmm4[2],zmm26[4],zmm4[4],zmm26[6],zmm4[6]
26844 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
26845 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
26846 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26847 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3776(%rdi), %zmm2
26848 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26849 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3712(%rdi), %zmm1
26850 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26851 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
26852 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3648(%rdi), %zmm3
26853 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26854 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3584(%rdi), %zmm2
26855 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26856 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2
26857 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
26858 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm18, %zmm29, %zmm0
26859 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm12[0],zmm11[0],zmm12[2],zmm11[2],zmm12[4],zmm11[4],zmm12[6],zmm11[6]
26860 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
26861 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0
26862 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26863 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,5,13,5,13,5,13]
26864 ; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
26865 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26866 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
26867 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm10
26868 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
26869 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm0, %zmm2
26870 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm10[4,5,6,7]
26871 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm2
26872 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm2
26873 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
26874 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
26875 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm19[1],zmm25[1],zmm19[3],zmm25[3],zmm19[5],zmm25[5],zmm19[7],zmm25[7]
26876 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
26877 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
26878 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26879 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
26880 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
26881 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm1
26882 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
26883 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
26884 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm2
26885 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
26886 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm29
26887 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm2
26888 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm0, %zmm2
26889 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
26890 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
26891 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm15[1],zmm28[1],zmm15[3],zmm28[3],zmm15[5],zmm28[5],zmm15[7],zmm28[7]
26892 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
26893 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
26894 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26895 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
26896 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
26897 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm1
26898 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
26899 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
26900 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm0, %zmm2
26901 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
26902 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
26903 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm2
26904 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
26905 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
26906 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm12[1],zmm18[1],zmm12[3],zmm18[3],zmm12[5],zmm18[5],zmm12[7],zmm18[7]
26907 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
26908 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
26909 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26910 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
26911 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
26912 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm0, %zmm1
26913 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
26914 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
26915 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm2
26916 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
26917 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm2
26918 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
26919 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm0, %zmm2
26920 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
26921 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm3[1],zmm13[1],zmm3[3],zmm13[3],zmm3[5],zmm13[5],zmm3[7],zmm13[7]
26922 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
26923 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
26924 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26925 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
26926 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
26927 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm1
26928 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
26929 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
26930 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm0, %zmm2
26931 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
26932 ; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm31 # 64-byte Folded Reload
26933 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm3 # 64-byte Folded Reload
26934 ; AVX512DQ-BW-FCP-NEXT: # zmm3 = zmm27[1],mem[1],zmm27[3],mem[3],zmm27[5],mem[5],zmm27[7],mem[7]
26935 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm3 {%k1}
26936 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
26937 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26938 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
26939 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm8
26940 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
26941 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
26942 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm2
26943 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm8[4,5,6,7]
26944 ; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 64-byte Folded Reload
26945 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm3 # 64-byte Folded Reload
26946 ; AVX512DQ-BW-FCP-NEXT: # zmm3 = zmm7[1],mem[1],zmm7[3],mem[3],zmm7[5],mem[5],zmm7[7],mem[7]
26947 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1}
26948 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
26949 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26950 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
26951 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
26952 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm0, %zmm1
26953 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
26954 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm2
26955 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
26956 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm2
26957 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
26958 ; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload
26959 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
26960 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm3[1],zmm4[1],zmm3[3],zmm4[3],zmm3[5],zmm4[5],zmm3[7],zmm4[7]
26961 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1}
26962 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
26963 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26964 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
26965 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1
26966 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
26967 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm1
26968 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
26969 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm2
26970 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
26971 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm2
26972 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
26973 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
26974 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
26975 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm10, %zmm9, %zmm0
26976 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
26977 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload
26978 ; AVX512DQ-BW-FCP-NEXT: # zmm2 = zmm2[1],mem[1],zmm2[3],mem[3],zmm2[5],mem[5],zmm2[7],mem[7]
26979 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
26980 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0
26981 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26982 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14]
26983 ; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
26984 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
26985 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm1
26986 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
26987 ; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
26988 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
26989 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm2
26990 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm2
26991 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
26992 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm2 {%k1} # 64-byte Folded Reload
26993 ; AVX512DQ-BW-FCP-NEXT: # zmm2 {%k1} = zmm25[0],mem[0],zmm25[2],mem[2],zmm25[4],mem[4],zmm25[6],mem[6]
26994 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
26995 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
26996 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
26997 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm1
26998 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
26999 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm2
27000 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
27001 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm0, %zmm15
27002 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm15 {%k1} = zmm29[0],zmm30[0],zmm29[2],zmm30[2],zmm29[4],zmm30[4],zmm29[6],zmm30[6]
27003 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm1
27004 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
27005 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
27006 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm1
27007 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
27008 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm0, %zmm2
27009 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
27010 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm2
27011 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm2
27012 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
27013 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
27014 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm30[0],zmm26[0],zmm30[2],zmm26[2],zmm30[4],zmm26[4],zmm30[6],zmm26[6]
27015 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
27016 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
27017 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
27018 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm0, %zmm1
27019 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
27020 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm2
27021 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
27022 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
27023 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm2
27024 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
27025 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm0, %zmm2
27026 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
27027 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm24[0],zmm23[0],zmm24[2],zmm23[2],zmm24[4],zmm23[4],zmm24[6],zmm23[6]
27028 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
27029 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
27030 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
27031 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm1
27032 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
27033 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm0, %zmm2
27034 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
27035 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload
27036 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
27037 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm0, %zmm2
27038 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
27039 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
27040 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6]
27041 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
27042 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
27043 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
27044 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm1
27045 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
27046 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm2
27047 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
27048 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
27049 ; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
27050 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
27051 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm2 {%k1} # 64-byte Folded Reload
27052 ; AVX512DQ-BW-FCP-NEXT: # zmm2 {%k1} = zmm11[0],mem[0],zmm11[2],mem[2],zmm11[4],mem[4],zmm11[6],mem[6]
27053 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
27054 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
27055 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
27056 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm0, %zmm1
27057 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm7
27058 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7]
27059 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
27060 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm2
27061 ; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
27062 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
27063 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
27064 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm7[0],zmm28[0],zmm7[2],zmm28[2],zmm7[4],zmm28[4],zmm7[6],zmm28[6]
27065 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
27066 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
27067 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm3
27068 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm4
27069 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm3[4,5,6,7]
27070 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
27071 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
27072 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm25, %zmm15, %zmm0
27073 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm9[0],zmm10[0],zmm9[2],zmm10[2],zmm9[4],zmm10[4],zmm9[6],zmm10[6]
27074 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
27075 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
27076 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm1
27077 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm0
27078 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [7,15,7,15,7,15,7,15]
27079 ; AVX512DQ-BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
27080 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
27081 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm19
27082 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
27083 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
27084 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm19 {%k1} = zmm17[1],zmm4[1],zmm17[3],zmm4[3],zmm17[5],zmm4[5],zmm17[7],zmm4[7]
27085 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
27086 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm6
27087 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [0,8,0,8,0,8,0,8]
27088 ; AVX512DQ-BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
27089 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm12, %zmm17
27090 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
27091 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [1,9,1,9,1,9,1,9]
27092 ; AVX512DQ-BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
27093 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm2, %zmm6
27094 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
27095 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm12, %zmm1
27096 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
27097 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm0
27098 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
27099 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
27100 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm20
27101 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0
27102 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
27103 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm5, %zmm1
27104 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
27105 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
27106 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm13[1],zmm11[1],zmm13[3],zmm11[3],zmm13[5],zmm11[5],zmm13[7],zmm11[7]
27107 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
27108 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm1
27109 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm12, %zmm13
27110 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
27111 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm2, %zmm1
27112 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
27113 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm12, %zmm20
27114 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm2, %zmm0
27115 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
27116 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
27117 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm17
27118 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0
27119 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
27120 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm5, %zmm1
27121 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm10
27122 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm30[1],zmm26[1],zmm30[3],zmm26[3],zmm30[5],zmm26[5],zmm30[7],zmm26[7]
27123 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
27124 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm1
27125 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm26
27126 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm12, %zmm26
27127 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm2, %zmm1
27128 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
27129 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm12, %zmm17
27130 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm0
27131 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
27132 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm19
27133 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm31
27134 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm3
27135 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm5, %zmm16
27136 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
27137 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm16 {%k1} = zmm24[1],zmm6[1],zmm24[3],zmm6[3],zmm24[5],zmm6[5],zmm24[7],zmm6[7]
27138 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
27139 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm0
27140 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm29
27141 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm12, %zmm29
27142 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm0
27143 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
27144 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm12, %zmm19
27145 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm31
27146 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload
27147 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm16
27148 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm27
27149 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm5, %zmm1
27150 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm23[1],zmm18[1],zmm23[3],zmm18[3],zmm23[5],zmm18[5],zmm23[7],zmm18[7]
27151 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
27152 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm1
27153 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm24
27154 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm12, %zmm24
27155 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm2, %zmm1
27156 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
27157 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm12, %zmm16
27158 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm2, %zmm27
27159 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
27160 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm18
27161 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm30
27162 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
27163 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm5, %zmm1
27164 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
27165 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
27166 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm10[1],zmm0[1],zmm10[3],zmm0[3],zmm10[5],zmm0[5],zmm10[7],zmm0[7]
27167 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
27168 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm1
27169 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm12, %zmm10
27170 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
27171 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
27172 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
27173 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm12, %zmm18
27174 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm30
27175 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm6
27176 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm23
27177 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
27178 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm5, %zmm6
27179 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm0
27180 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
27181 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm21[1],zmm28[1],zmm21[3],zmm28[3],zmm21[5],zmm28[5],zmm21[7],zmm28[7]
27182 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm28
27183 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm1
27184 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm12, %zmm21
27185 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
27186 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
27187 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm12, %zmm23
27188 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm22
27189 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
27190 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm0
27191 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
27192 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm12, %zmm0
27193 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
27194 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm25, %zmm15, %zmm12
27195 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm3
27196 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm3
27197 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
27198 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm25, %zmm15, %zmm2
27199 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm5, %zmm15
27200 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm14[1],zmm8[1],zmm14[3],zmm8[3],zmm14[5],zmm8[5],zmm14[7],zmm8[7]
27201 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm25
27202 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
27203 ; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm7 # 64-byte Folded Reload
27204 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
27205 ; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm0 # 64-byte Folded Reload
27206 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
27207 ; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm8 # 64-byte Folded Reload
27208 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
27209 ; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm1 # 64-byte Folded Reload
27210 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
27211 ; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm13 # 64-byte Folded Reload
27212 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
27213 ; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm3 # 64-byte Folded Reload
27214 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
27215 ; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm9 # 64-byte Folded Reload
27216 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
27217 ; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm4 # 64-byte Folded Reload
27218 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
27219 ; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm11 # 64-byte Folded Reload
27220 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
27221 ; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm6 # 64-byte Folded Reload
27222 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
27223 ; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm15 # 64-byte Folded Reload
27224 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
27225 ; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload
27226 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
27227 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
27228 ; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload
27229 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
27230 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
27231 ; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload
27232 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
27233 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
27234 ; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload
27235 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
27236 ; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm14 # 64-byte Folded Reload
27237 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7]
27238 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
27239 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0
27240 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
27241 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm8[4,5,6,7]
27242 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
27243 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
27244 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
27245 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm13[4,5,6,7]
27246 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
27247 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
27248 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
27249 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm9[4,5,6,7]
27250 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload
27251 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
27252 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
27253 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm11[4,5,6,7]
27254 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
27255 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
27256 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
27257 ; AVX512DQ-BW-FCP-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload
27258 ; AVX512DQ-BW-FCP-NEXT: # ymm0 = mem[0,1,2,3],ymm15[4,5,6,7]
27259 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm0
27260 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
27261 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
27262 ; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
27263 ; AVX512DQ-BW-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
27264 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm25
27265 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm10[4,5,6,7]
27266 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
27267 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
27268 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
27269 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm17 {%k1}
27270 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm8
27271 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm8, %ymm8
27272 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
27273 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm0
27274 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm28 = ymm0[0],ymm8[0],ymm0[2],ymm8[2]
27275 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm28, %zmm17, %zmm26
27276 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
27277 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1}
27278 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 576(%rdi), %xmm13
27279 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 704(%rdi), %ymm13, %ymm13
27280 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %xmm28
27281 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, 640(%rdi), %ymm28, %ymm28
27282 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm28[0],ymm13[0],ymm28[2],ymm13[2]
27283 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm1
27284 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
27285 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm16 {%k1}
27286 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 1088(%rdi), %xmm4
27287 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 1216(%rdi), %ymm4, %ymm4
27288 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 1024(%rdi), %xmm7
27289 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 1152(%rdi), %ymm7, %ymm7
27290 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm7[0],ymm4[0],ymm7[2],ymm4[2]
27291 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm16, %zmm24
27292 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm19 {%k1}
27293 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 1600(%rdi), %xmm5
27294 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 1728(%rdi), %ymm5, %ymm5
27295 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 1536(%rdi), %xmm11
27296 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 1664(%rdi), %ymm11, %ymm11
27297 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm29 = ymm11[0],ymm5[0],ymm11[2],ymm5[2]
27298 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm29, %zmm19, %zmm19
27299 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm23 {%k1}
27300 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 2112(%rdi), %xmm6
27301 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 2240(%rdi), %ymm6, %ymm6
27302 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2048(%rdi), %xmm29
27303 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, 2176(%rdi), %ymm29, %ymm29
27304 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm29[0],ymm6[0],ymm29[2],ymm6[2]
27305 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm23, %zmm3
27306 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
27307 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1}
27308 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 2624(%rdi), %xmm10
27309 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 2752(%rdi), %ymm10, %ymm10
27310 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2560(%rdi), %xmm23
27311 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, 2688(%rdi), %ymm23, %ymm23
27312 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm20 = ymm23[0],ymm10[0],ymm23[2],ymm10[2]
27313 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm20, %zmm18, %zmm18
27314 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
27315 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
27316 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1}
27317 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3136(%rdi), %xmm17
27318 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, 3264(%rdi), %ymm17, %ymm17
27319 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3072(%rdi), %xmm20
27320 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, 3200(%rdi), %ymm20, %ymm20
27321 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm20[0],ymm17[0],ymm20[2],ymm17[2]
27322 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm15, %zmm1, %zmm15
27323 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
27324 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm12 {%k1}
27325 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 3648(%rdi), %xmm9
27326 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 3776(%rdi), %ymm9, %ymm9
27327 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 3584(%rdi), %xmm14
27328 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 3712(%rdi), %ymm14, %ymm14
27329 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm16 = ymm14[0],ymm9[0],ymm14[2],ymm9[2]
27330 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm16, %zmm12, %zmm1
27331 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
27332 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
27333 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm12 {%k1}
27334 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm16 = ymm20[1],ymm17[1],ymm20[3],ymm17[3]
27335 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm16, %zmm12, %zmm16
27336 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
27337 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
27338 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm12 {%k1}
27339 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm28[1],ymm13[1],ymm28[3],ymm13[3]
27340 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm12, %zmm13
27341 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
27342 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
27343 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm12 {%k1}
27344 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm8[1],ymm0[3],ymm8[3]
27345 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0
27346 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
27347 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm31 {%k1}
27348 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm11[1],ymm5[1],ymm11[3],ymm5[3]
27349 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm31, %zmm5
27350 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
27351 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm27 {%k1}
27352 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm7[1],ymm4[1],ymm7[3],ymm4[3]
27353 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4
27354 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
27355 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm30 {%k1}
27356 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm23[1],ymm10[1],ymm23[3],ymm10[3]
27357 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm30, %zmm7
27358 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
27359 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm22 {%k1}
27360 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm29[1],ymm6[1],ymm29[3],ymm6[3]
27361 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm22, %zmm6
27362 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
27363 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1}
27364 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm14[1],ymm9[1],ymm14[3],ymm9[3]
27365 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm2, %zmm2
27366 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 448(%rsi)
27367 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, 384(%rsi)
27368 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, 320(%rsi)
27369 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 256(%rsi)
27370 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, 192(%rsi)
27371 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, 128(%rsi)
27372 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
27373 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm1, 64(%rsi)
27374 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, (%rsi)
27375 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 448(%rdx)
27376 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 256(%rdx)
27377 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, 320(%rdx)
27378 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 128(%rdx)
27379 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 192(%rdx)
27380 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%rdx)
27381 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, 64(%rdx)
27382 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, 384(%rdx)
27383 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
27384 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 448(%rcx)
27385 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
27386 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 256(%rcx)
27387 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
27388 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 320(%rcx)
27389 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
27390 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 128(%rcx)
27391 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
27392 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 192(%rcx)
27393 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
27394 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, (%rcx)
27395 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
27396 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 64(%rcx)
27397 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
27398 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 384(%rcx)
27399 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
27400 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 448(%r8)
27401 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
27402 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 256(%r8)
27403 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
27404 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 320(%r8)
27405 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
27406 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 128(%r8)
27407 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
27408 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 192(%r8)
27409 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
27410 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, (%r8)
27411 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
27412 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 64(%r8)
27413 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
27414 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 384(%r8)
27415 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
27416 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 448(%r9)
27417 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
27418 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 256(%r9)
27419 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
27420 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 320(%r9)
27421 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
27422 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 128(%r9)
27423 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
27424 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 192(%r9)
27425 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
27426 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, (%r9)
27427 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
27428 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 64(%r9)
27429 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
27430 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 384(%r9)
27431 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
27432 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
27433 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 448(%rax)
27434 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
27435 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 256(%rax)
27436 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
27437 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 320(%rax)
27438 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
27439 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 128(%rax)
27440 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
27441 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 192(%rax)
27442 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
27443 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, (%rax)
27444 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
27445 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 64(%rax)
27446 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
27447 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 384(%rax)
27448 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
27449 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
27450 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 448(%rax)
27451 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
27452 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 256(%rax)
27453 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
27454 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 320(%rax)
27455 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
27456 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 128(%rax)
27457 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
27458 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 192(%rax)
27459 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
27460 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, (%rax)
27461 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
27462 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 64(%rax)
27463 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
27464 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 384(%rax)
27465 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
27466 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
27467 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 384(%rax)
27468 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, 448(%rax)
27469 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
27470 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 256(%rax)
27471 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
27472 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 320(%rax)
27473 ; AVX512DQ-BW-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
27474 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 128(%rax)
27475 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
27476 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 192(%rax)
27477 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
27478 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, (%rax)
27479 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
27480 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 64(%rax)
27481 ; AVX512DQ-BW-FCP-NEXT: addq $6664, %rsp # imm = 0x1A08
27482 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
27483 ; AVX512DQ-BW-FCP-NEXT: retq
27484 %wide.vec = load <512 x i64>, ptr %in.vec, align 64
27485 %strided.vec0 = shufflevector <512 x i64> %wide.vec, <512 x i64> poison, <64 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56, i32 64, i32 72, i32 80, i32 88, i32 96, i32 104, i32 112, i32 120, i32 128, i32 136, i32 144, i32 152, i32 160, i32 168, i32 176, i32 184, i32 192, i32 200, i32 208, i32 216, i32 224, i32 232, i32 240, i32 248, i32 256, i32 264, i32 272, i32 280, i32 288, i32 296, i32 304, i32 312, i32 320, i32 328, i32 336, i32 344, i32 352, i32 360, i32 368, i32 376, i32 384, i32 392, i32 400, i32 408, i32 416, i32 424, i32 432, i32 440, i32 448, i32 456, i32 464, i32 472, i32 480, i32 488, i32 496, i32 504>
27486 %strided.vec1 = shufflevector <512 x i64> %wide.vec, <512 x i64> poison, <64 x i32> <i32 1, i32 9, i32 17, i32 25, i32 33, i32 41, i32 49, i32 57, i32 65, i32 73, i32 81, i32 89, i32 97, i32 105, i32 113, i32 121, i32 129, i32 137, i32 145, i32 153, i32 161, i32 169, i32 177, i32 185, i32 193, i32 201, i32 209, i32 217, i32 225, i32 233, i32 241, i32 249, i32 257, i32 265, i32 273, i32 281, i32 289, i32 297, i32 305, i32 313, i32 321, i32 329, i32 337, i32 345, i32 353, i32 361, i32 369, i32 377, i32 385, i32 393, i32 401, i32 409, i32 417, i32 425, i32 433, i32 441, i32 449, i32 457, i32 465, i32 473, i32 481, i32 489, i32 497, i32 505>
27487 %strided.vec2 = shufflevector <512 x i64> %wide.vec, <512 x i64> poison, <64 x i32> <i32 2, i32 10, i32 18, i32 26, i32 34, i32 42, i32 50, i32 58, i32 66, i32 74, i32 82, i32 90, i32 98, i32 106, i32 114, i32 122, i32 130, i32 138, i32 146, i32 154, i32 162, i32 170, i32 178, i32 186, i32 194, i32 202, i32 210, i32 218, i32 226, i32 234, i32 242, i32 250, i32 258, i32 266, i32 274, i32 282, i32 290, i32 298, i32 306, i32 314, i32 322, i32 330, i32 338, i32 346, i32 354, i32 362, i32 370, i32 378, i32 386, i32 394, i32 402, i32 410, i32 418, i32 426, i32 434, i32 442, i32 450, i32 458, i32 466, i32 474, i32 482, i32 490, i32 498, i32 506>
27488 %strided.vec3 = shufflevector <512 x i64> %wide.vec, <512 x i64> poison, <64 x i32> <i32 3, i32 11, i32 19, i32 27, i32 35, i32 43, i32 51, i32 59, i32 67, i32 75, i32 83, i32 91, i32 99, i32 107, i32 115, i32 123, i32 131, i32 139, i32 147, i32 155, i32 163, i32 171, i32 179, i32 187, i32 195, i32 203, i32 211, i32 219, i32 227, i32 235, i32 243, i32 251, i32 259, i32 267, i32 275, i32 283, i32 291, i32 299, i32 307, i32 315, i32 323, i32 331, i32 339, i32 347, i32 355, i32 363, i32 371, i32 379, i32 387, i32 395, i32 403, i32 411, i32 419, i32 427, i32 435, i32 443, i32 451, i32 459, i32 467, i32 475, i32 483, i32 491, i32 499, i32 507>
27489 %strided.vec4 = shufflevector <512 x i64> %wide.vec, <512 x i64> poison, <64 x i32> <i32 4, i32 12, i32 20, i32 28, i32 36, i32 44, i32 52, i32 60, i32 68, i32 76, i32 84, i32 92, i32 100, i32 108, i32 116, i32 124, i32 132, i32 140, i32 148, i32 156, i32 164, i32 172, i32 180, i32 188, i32 196, i32 204, i32 212, i32 220, i32 228, i32 236, i32 244, i32 252, i32 260, i32 268, i32 276, i32 284, i32 292, i32 300, i32 308, i32 316, i32 324, i32 332, i32 340, i32 348, i32 356, i32 364, i32 372, i32 380, i32 388, i32 396, i32 404, i32 412, i32 420, i32 428, i32 436, i32 444, i32 452, i32 460, i32 468, i32 476, i32 484, i32 492, i32 500, i32 508>
27490 %strided.vec5 = shufflevector <512 x i64> %wide.vec, <512 x i64> poison, <64 x i32> <i32 5, i32 13, i32 21, i32 29, i32 37, i32 45, i32 53, i32 61, i32 69, i32 77, i32 85, i32 93, i32 101, i32 109, i32 117, i32 125, i32 133, i32 141, i32 149, i32 157, i32 165, i32 173, i32 181, i32 189, i32 197, i32 205, i32 213, i32 221, i32 229, i32 237, i32 245, i32 253, i32 261, i32 269, i32 277, i32 285, i32 293, i32 301, i32 309, i32 317, i32 325, i32 333, i32 341, i32 349, i32 357, i32 365, i32 373, i32 381, i32 389, i32 397, i32 405, i32 413, i32 421, i32 429, i32 437, i32 445, i32 453, i32 461, i32 469, i32 477, i32 485, i32 493, i32 501, i32 509>
27491 %strided.vec6 = shufflevector <512 x i64> %wide.vec, <512 x i64> poison, <64 x i32> <i32 6, i32 14, i32 22, i32 30, i32 38, i32 46, i32 54, i32 62, i32 70, i32 78, i32 86, i32 94, i32 102, i32 110, i32 118, i32 126, i32 134, i32 142, i32 150, i32 158, i32 166, i32 174, i32 182, i32 190, i32 198, i32 206, i32 214, i32 222, i32 230, i32 238, i32 246, i32 254, i32 262, i32 270, i32 278, i32 286, i32 294, i32 302, i32 310, i32 318, i32 326, i32 334, i32 342, i32 350, i32 358, i32 366, i32 374, i32 382, i32 390, i32 398, i32 406, i32 414, i32 422, i32 430, i32 438, i32 446, i32 454, i32 462, i32 470, i32 478, i32 486, i32 494, i32 502, i32 510>
27492 %strided.vec7 = shufflevector <512 x i64> %wide.vec, <512 x i64> poison, <64 x i32> <i32 7, i32 15, i32 23, i32 31, i32 39, i32 47, i32 55, i32 63, i32 71, i32 79, i32 87, i32 95, i32 103, i32 111, i32 119, i32 127, i32 135, i32 143, i32 151, i32 159, i32 167, i32 175, i32 183, i32 191, i32 199, i32 207, i32 215, i32 223, i32 231, i32 239, i32 247, i32 255, i32 263, i32 271, i32 279, i32 287, i32 295, i32 303, i32 311, i32 319, i32 327, i32 335, i32 343, i32 351, i32 359, i32 367, i32 375, i32 383, i32 391, i32 399, i32 407, i32 415, i32 423, i32 431, i32 439, i32 447, i32 455, i32 463, i32 471, i32 479, i32 487, i32 495, i32 503, i32 511>
27493 store <64 x i64> %strided.vec0, ptr %out.vec0, align 64
27494 store <64 x i64> %strided.vec1, ptr %out.vec1, align 64
27495 store <64 x i64> %strided.vec2, ptr %out.vec2, align 64
27496 store <64 x i64> %strided.vec3, ptr %out.vec3, align 64
27497 store <64 x i64> %strided.vec4, ptr %out.vec4, align 64
27498 store <64 x i64> %strided.vec5, ptr %out.vec5, align 64
27499 store <64 x i64> %strided.vec6, ptr %out.vec6, align 64
27500 store <64 x i64> %strided.vec7, ptr %out.vec7, align 64